From 6f6009d63e437a5c0bfc8d1e3b45bcaacdec02df Mon Sep 17 00:00:00 2001 From: jigoong Date: Mon, 2 Mar 2026 21:58:51 +0700 Subject: [PATCH] add superset airbyte setup and merge md file --- .env.global | 6 + 01-infra/init/02-create-airbyte-databases.sql | 14 ++ 04-ingestion/.airbyte.env | 54 ++++ 04-ingestion/.gitignore | 19 ++ 04-ingestion/ARCHITECTURE.md | 169 +++++++++++++ 04-ingestion/NGINX-SETUP.md | 183 ++++++++++++++ 04-ingestion/QUICKSTART.md | 187 ++++++++++++++ 04-ingestion/README.md | 237 ++++++++++++++++-- 04-ingestion/setup-airbyte.sh | 228 +++++++++++++++++ 04-ingestion/start-airbyte.sh | 31 +++ 04-ingestion/stop-airbyte.sh | 29 +++ 04-ingestion/uninstall-airbyte.sh | 54 ++++ 06-analytics/Dockerfile | 15 ++ 06-analytics/docker-compose.yml | 11 +- 06-analytics/superset_config.py | 2 +- 15 files changed, 1220 insertions(+), 19 deletions(-) create mode 100644 01-infra/init/02-create-airbyte-databases.sql create mode 100644 04-ingestion/.airbyte.env create mode 100644 04-ingestion/.gitignore create mode 100644 04-ingestion/ARCHITECTURE.md create mode 100644 04-ingestion/NGINX-SETUP.md create mode 100644 04-ingestion/QUICKSTART.md create mode 100644 04-ingestion/setup-airbyte.sh create mode 100644 04-ingestion/start-airbyte.sh create mode 100644 04-ingestion/stop-airbyte.sh create mode 100644 04-ingestion/uninstall-airbyte.sh create mode 100644 06-analytics/Dockerfile diff --git a/.env.global b/.env.global index ce2a3f8..a8d2678 100644 --- a/.env.global +++ b/.env.global @@ -23,3 +23,9 @@ APP_NAME=APIsService ADMIN_SECRET_KEY=apiservice_admin_secret_2026 ADMIN_USERNAME=admin ADMIN_PASSWORD=change_me_2026 + +# Airbyte Configuration +AIRBYTE_PORT=8030 +AIRBYTE_BASIC_AUTH_USERNAME= +AIRBYTE_BASIC_AUTH_PASSWORD= +AIRBYTE_BASIC_AUTH_PROXY_TIMEOUT=900 diff --git a/01-infra/init/02-create-airbyte-databases.sql b/01-infra/init/02-create-airbyte-databases.sql new file mode 100644 index 0000000..2a6926e --- /dev/null +++ b/01-infra/init/02-create-airbyte-databases.sql 
@@ -0,0 +1,14 @@ +-- Create databases for Airbyte OSS +-- These databases will be used by the Airbyte deployment in 04-ingestion + +-- Main Airbyte database +CREATE DATABASE airbyte; + +-- Temporal workflow engine databases +CREATE DATABASE temporal; +CREATE DATABASE temporal_visibility; + +-- Grant permissions to postgres user +GRANT ALL PRIVILEGES ON DATABASE airbyte TO postgres; +GRANT ALL PRIVILEGES ON DATABASE temporal TO postgres; +GRANT ALL PRIVILEGES ON DATABASE temporal_visibility TO postgres; diff --git a/04-ingestion/.airbyte.env b/04-ingestion/.airbyte.env new file mode 100644 index 0000000..4a3e414 --- /dev/null +++ b/04-ingestion/.airbyte.env @@ -0,0 +1,54 @@ +# Airbyte Configuration +# This file contains environment-specific settings for Airbyte deployment + +# Airbyte Version (using latest stable - will be determined by abctl) +AIRBYTE_VERSION=latest + +# Database Configuration (using shared PostgreSQL from 01-infra) +DATABASE_HOST=postgres +DATABASE_PORT=5432 +DATABASE_USER=postgres +DATABASE_PASSWORD=Secure_Hospital_Pass_2026 +DATABASE_DB=airbyte + +# Temporal Database Configuration +TEMPORAL_DB=temporal +TEMPORAL_VISIBILITY_DB=temporal_visibility + +# Port Configuration +# Using 8030 instead of default 8000 to avoid conflicts +AIRBYTE_PORT=8030 + +# Authentication +# Using Nginx Proxy Manager for authentication (Keycloak not natively supported by Airbyte) +# Basic auth disabled - authentication handled by nginx proxy +BASIC_AUTH_USERNAME= +BASIC_AUTH_PASSWORD= +BASIC_AUTH_PROXY_TIMEOUT=900 + +# Deployment Settings +# Enable low-resource mode for environments with <4 CPU cores +LOW_RESOURCE_MODE=true + +# Host Configuration +# Domain: ai.sriphat.com/airbyte (configured via nginx proxy) +# Local access: IP:8030 +AIRBYTE_HOST=ai.sriphat.com + +# Set to true if running over HTTP behind nginx proxy +INSECURE_COOKIES=true + +# Backup Configuration +ENABLE_BACKUP=true +BACKUP_SCHEDULE="0 2 * * *" # Daily at 2 AM + +# Workspace and Data Directories 
+WORKSPACE_ROOT=/workspace +CONFIG_ROOT=/data +LOCAL_ROOT=/local_root + +# Logging +LOG_LEVEL=INFO + +# Timezone +TZ=Asia/Bangkok diff --git a/04-ingestion/.gitignore b/04-ingestion/.gitignore new file mode 100644 index 0000000..84acff8 --- /dev/null +++ b/04-ingestion/.gitignore @@ -0,0 +1,19 @@ +# Airbyte data directories +data/ +workspace/ +config/ +local_root/ + +# abctl generated files +.abctl/ + +# Backups +backups/ +*.sql +*.tar.gz + +# Environment files with sensitive data +.env.local + +# Logs +*.log diff --git a/04-ingestion/ARCHITECTURE.md b/04-ingestion/ARCHITECTURE.md new file mode 100644 index 0000000..b055194 --- /dev/null +++ b/04-ingestion/ARCHITECTURE.md @@ -0,0 +1,169 @@ +# Airbyte Network Architecture + +## Overview + +Airbyte deployment uses the **existing** Nginx Proxy Manager from `01-infra`. No additional nginx is needed in `04-ingestion`. + +## Network Flow + +``` +Internet (HTTPS) + ↓ +Nginx Proxy Manager (01-infra) + - Container: nginx-proxy-manager + - Ports: 80, 443, 8021 (admin) + - Network: shared_data_network + ↓ +airbyte-proxy (deployed by abctl) + - Container: airbyte-proxy + - Internal Port: 8000 + - External Port: 8030 (mapped) + - Network: shared_data_network + ↓ +Airbyte Services + - airbyte-server + - airbyte-worker + - airbyte-webapp + - airbyte-temporal + - etc. +``` + +## Access Methods + +### 1. Production (via Domain) +``` +https://ai.sriphat.com/airbyte + ↓ +Nginx Proxy Manager (01-infra) + ↓ +airbyte-proxy:8000 (internal) + ↓ +Airbyte Services +``` + +### 2. Local/Development +``` +http://localhost:8030 + ↓ +airbyte-proxy:8030 (port mapping) + ↓ +Airbyte Services +``` + +### 3. 
Direct IP Access +``` +http://[SERVER_IP]:8030 + ↓ +airbyte-proxy:8030 (port mapping) + ↓ +Airbyte Services +``` + +## Components + +### 01-infra (Shared Infrastructure) +- **Nginx Proxy Manager**: External reverse proxy + - Handles SSL/TLS termination + - Routes traffic to backend services + - Manages authentication (OAuth2/Basic Auth) + - Domain: ai.sriphat.com + +- **PostgreSQL**: Shared database + - Databases: `airbyte`, `temporal`, `temporal_visibility` + - Used by Airbyte for metadata storage + +- **Keycloak**: Identity provider (optional) + - Can be integrated via OAuth2 Proxy + - Provides SSO for all services + +### 04-ingestion (Airbyte) +- **airbyte-proxy**: Internal nginx (deployed by abctl) + - Routes between Airbyte microservices + - Not the public entry point in production (external traffic enters via Nginx Proxy Manager) + - Listens on port 8000 (internal), 8030 (external) + +- **Airbyte Services**: Deployed by abctl + - All services connect to `shared_data_network` + - Communicate with PostgreSQL and each other + +## Network Configuration + +### shared_data_network +All services connect to this Docker network: +- nginx-proxy-manager (01-infra) +- postgres (01-infra) +- keycloak (01-infra) +- airbyte-proxy (04-ingestion) +- airbyte-server (04-ingestion) +- airbyte-worker (04-ingestion) +- airbyte-webapp (04-ingestion) +- airbyte-temporal (04-ingestion) +- etc. + +### Port Mappings + +**External Ports:** +- 80, 443: Nginx Proxy Manager (HTTP/HTTPS) +- 8021: Nginx Proxy Manager Admin UI +- 8030: Airbyte (direct access, optional) +- 5435: PostgreSQL (external access) + +**Internal Ports:** +- 8000: airbyte-proxy (accessed by Nginx Proxy Manager) +- 5432: postgres (internal network only) +- 8080: keycloak (internal network only) + +## Why No Additional Nginx? + +1. **abctl deploys airbyte-proxy**: This is Airbyte's internal nginx for routing between microservices +2. **Nginx Proxy Manager exists**: Already running in `01-infra` for external access +3. **Shared network**: Both can communicate via `shared_data_network` +4. 
**Single point of entry**: Nginx Proxy Manager handles all external traffic + +## Configuration Steps + +1. **Deploy Infrastructure** (01-infra) + ```bash + cd 01-infra + docker compose --env-file ../.env.global up -d + ``` + +2. **Deploy Airbyte** (04-ingestion) + ```bash + cd 04-ingestion + bash setup-airbyte.sh + ``` + - This deploys airbyte-proxy automatically + - Connects to shared_data_network + - Uses shared PostgreSQL + +3. **Configure Nginx Proxy Manager** + - Add proxy host for `ai.sriphat.com` + - Forward to `airbyte-proxy:8000` + - Enable SSL + - Add authentication (optional) + +## Security Layers + +1. **SSL/TLS**: Nginx Proxy Manager (Let's Encrypt) +2. **Authentication**: OAuth2 Proxy + Keycloak OR Basic Auth +3. **Network Isolation**: Docker network (shared_data_network) +4. **Firewall**: Only expose necessary ports + +## Troubleshooting + +### Cannot access via domain +- Check Nginx Proxy Manager is running +- Verify proxy host configuration +- Check DNS points to server +- Verify SSL certificate + +### Cannot access locally +- Check airbyte-proxy is running: `docker ps | grep airbyte-proxy` +- Verify port 8030 is mapped +- Check firewall allows port 8030 + +### Services cannot communicate +- Verify all containers on `shared_data_network` +- Check network: `docker network inspect shared_data_network` +- Verify container names resolve (postgres, airbyte-proxy, etc.) diff --git a/04-ingestion/NGINX-SETUP.md b/04-ingestion/NGINX-SETUP.md new file mode 100644 index 0000000..ca9496c --- /dev/null +++ b/04-ingestion/NGINX-SETUP.md @@ -0,0 +1,183 @@ +# Nginx Proxy Manager Configuration for Airbyte + +## Overview + +This guide explains how to configure Nginx Proxy Manager to expose Airbyte at `https://ai.sriphat.com/airbyte` with optional Keycloak authentication. 
+ +## Prerequisites + +- Airbyte installed and running (port 8030) +- Nginx Proxy Manager running (port 8021 for admin) +- Domain `ai.sriphat.com` pointing to your server +- SSL certificate (Let's Encrypt recommended) + +## Step 1: Access Nginx Proxy Manager + +1. Open browser: `http://localhost:8021` +2. Login with admin credentials (from `.env.global`) + +## Step 2: Add Proxy Host + +### Basic Configuration + +1. Click **"Proxy Hosts"** → **"Add Proxy Host"** + +2. **Details Tab:** + - Domain Names: `ai.sriphat.com` + - Scheme: `http` + - Forward Hostname/IP: `airbyte-proxy` + - Forward Port: `8000` + - Cache Assets: ✓ (enabled) + - Block Common Exploits: ✓ (enabled) + - Websockets Support: ✓ (enabled) + +3. **Custom Locations Tab:** + - Click **"Add Location"** + - Location: `/airbyte` + - Scheme: `http` + - Forward Hostname/IP: `airbyte-proxy` + - Forward Port: `8000` + - Custom Config: + ```nginx + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header Host $host; + + # Remove /airbyte prefix when forwarding + rewrite ^/airbyte/(.*) /$1 break; + ``` + +4. **SSL Tab:** + - SSL Certificate: Select existing or create new Let's Encrypt + - Force SSL: ✓ (enabled) + - HTTP/2 Support: ✓ (enabled) + - HSTS Enabled: ✓ (enabled) + +5. Click **"Save"** + +## Step 3: Configure Keycloak Authentication (Optional) + +Since Airbyte doesn't natively support Keycloak, we'll use nginx authentication. + +### Option A: OAuth2 Proxy with Keycloak + +1. 
Deploy OAuth2 Proxy container: + ```bash + docker run -d \ + --name oauth2-proxy \ + --network shared_data_network \ + -p 4180:4180 \ + quay.io/oauth2-proxy/oauth2-proxy:latest \ + --provider=keycloak-oidc \ + --client-id=airbyte \ + --client-secret=YOUR_CLIENT_SECRET \ + --redirect-url=https://ai.sriphat.com/oauth2/callback \ + --oidc-issuer-url=https://ai.sriphat.com/keycloak/realms/master \ + --cookie-secret=RANDOM_SECRET_32_CHARS \ + --email-domain=* \ + --upstream=http://airbyte-proxy:8000 + ``` + +2. Update Nginx Proxy Host Custom Config: + ```nginx + # OAuth2 authentication + auth_request /oauth2/auth; + error_page 401 = /oauth2/sign_in; + + # Pass auth headers + auth_request_set $user $upstream_http_x_auth_request_user; + auth_request_set $email $upstream_http_x_auth_request_email; + proxy_set_header X-User $user; + proxy_set_header X-Email $email; + + # OAuth2 proxy location + location /oauth2/ { + proxy_pass http://oauth2-proxy:4180; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Scheme $scheme; + } + ``` + +### Option B: Basic Authentication (Simpler) + +1. In Nginx Proxy Manager, go to **Access Lists** +2. Create new Access List: + - Name: `Airbyte Access` + - Satisfy Any: ✓ + - Add users with username/password +3. Apply Access List to Airbyte Proxy Host + +### Option C: IP Whitelist + +1. In Nginx Proxy Manager Access List +2. Add allowed IP addresses: + - Internal network: `192.168.0.0/16` + - VPN range: `10.0.0.0/8` + - Specific IPs as needed + +## Step 4: Keycloak Client Setup (for OAuth2 Proxy) + +1. Login to Keycloak: `http://localhost:8080` +2. Select realm (or create new) +3. Go to **Clients** → **Create** +4. Client Configuration: + - Client ID: `airbyte` + - Client Protocol: `openid-connect` + - Access Type: `confidential` + - Valid Redirect URIs: `https://ai.sriphat.com/oauth2/callback` + - Web Origins: `https://ai.sriphat.com` +5. 
Save and copy **Client Secret** from Credentials tab + +## Step 5: Test Configuration + +1. Access Airbyte: + - External: `https://ai.sriphat.com/airbyte` + - Local: `http://localhost:8030` + +2. Verify: + - SSL certificate is valid + - Authentication works (if enabled) + - Websockets work (for real-time updates) + - No CORS errors in browser console + +## Troubleshooting + +### 502 Bad Gateway +- Check if `airbyte-proxy` container is running +- Verify network connectivity: `docker network inspect shared_data_network` +- Check logs: `docker logs airbyte-proxy` + +### Authentication Loop +- Clear browser cookies +- Verify OAuth2 Proxy configuration +- Check Keycloak client settings + +### WebSocket Errors +- Ensure "Websockets Support" is enabled in nginx +- Check browser console for connection errors +- Verify proxy headers are set correctly + +### SSL Certificate Issues +- Use Let's Encrypt for automatic renewal +- Ensure domain DNS points to server +- Check firewall allows ports 80 and 443 + +## Security Recommendations + +1. **Always use HTTPS** in production +2. **Enable authentication** (OAuth2 or Basic Auth) +3. **Whitelist IPs** if possible +4. **Enable rate limiting** in nginx +5. **Regular security updates** for all components +6. **Monitor access logs** for suspicious activity + +## Alternative: Direct Access + +For development or internal use, access directly: +``` +http://[SERVER_IP]:8030 +``` + +No authentication is applied on this path, so restrict port 8030 to the local network (for example with firewall rules). 
diff --git a/04-ingestion/QUICKSTART.md b/04-ingestion/QUICKSTART.md new file mode 100644 index 0000000..c798ea9 --- /dev/null +++ b/04-ingestion/QUICKSTART.md @@ -0,0 +1,187 @@ +# Airbyte Quick Start Guide + +## สถาปัตยกรรม + +Airbyte ใช้ทรัพยากรร่วมจาก `01-infra`: +- ✅ **PostgreSQL**: ฐานข้อมูลร่วม +- ✅ **Nginx Proxy Manager**: Reverse proxy สำหรับเข้าถึงจากภายนอก +- ✅ **Network**: `shared_data_network` + +**หมายเหตุสำคัญ**: `abctl` จะสร้าง `airbyte-proxy` (nginx ภายใน) สำหรับ routing ระหว่าง microservices ของ Airbyte เท่านั้น การเข้าถึงจากภายนอกใช้ Nginx Proxy Manager ที่มีอยู่แล้วใน `01-infra` **ไม่ต้องติดตั้ง nginx เพิ่มในโฟลเดอร์นี้** + +## วิธีติดตั้ง Airbyte OSS ด้วย abctl + +### ขั้นตอนที่ 1: เตรียม Infrastructure + +ตรวจสอบว่า PostgreSQL กำลังทำงาน: + +```bash +cd ../01-infra +docker compose --env-file ../.env.global up -d +``` + +รอจนกว่า PostgreSQL จะพร้อม (ประมาณ 10-30 วินาที) + +### ขั้นตอนที่ 2: ติดตั้ง Airbyte + +```bash +cd ../04-ingestion + +# สำหรับ Linux/macOS +bash setup-airbyte.sh + +# สำหรับ Windows (ใน Git Bash หรือ WSL) +bash setup-airbyte.sh +``` + +การติดตั้งจะใช้เวลา 10-30 นาที ขึ้นอยู่กับความเร็วอินเทอร์เน็ต + +### ขั้นตอนที่ 3: เข้าใช้งาน + +**สำหรับ Production (ผ่าน Domain):** +``` +https://ai.sriphat.com/airbyte +``` +*หมายเหตุ: ต้องตั้งค่า Nginx Proxy Manager ก่อน (ดู NGINX-SETUP.md)* + +**สำหรับ Development/Local:** +``` +http://localhost:8030 +หรือ +http://[IP-ของเซิร์ฟเวอร์]:8030 +``` + +กรอกข้อมูล: +- Email: อีเมลของคุณ +- Organization name: ชื่อองค์กร (เช่น Sriphat Hospital) + +### การจัดการ Airbyte + +**เริ่มต้น Airbyte:** +```bash +bash start-airbyte.sh +``` + +**หยุด Airbyte:** +```bash +bash stop-airbyte.sh +``` + +**ถอนการติดตั้ง:** +```bash +bash uninstall-airbyte.sh +``` + +## การตั้งค่าเพิ่มเติม + +### การตั้งค่าที่สำคัญ + +แก้ไขไฟล์ `.airbyte.env`: + +**Low Resource Mode (เปิดใช้งานอยู่แล้ว):** +```bash +LOW_RESOURCE_MODE=true # เหมาะสำหรับเครื่องที่มี CPU < 4 cores +``` +*หมายเหตุ: โหมดนี้จะปิดการใช้งาน Connector 
Builder* + +**เปลี่ยนพอร์ต:** +```bash +AIRBYTE_PORT=8030 # เปลี่ยนเป็นพอร์ตที่ต้องการ +``` + +**ตั้งค่า Domain:** +```bash +AIRBYTE_HOST=ai.sriphat.com # Domain สำหรับเข้าถึงจากภายนอก +``` + +**เปิดใช้งาน Backup:** +```bash +ENABLE_BACKUP=true # สร้าง backup script อัตโนมัติ +BACKUP_SCHEDULE="0 2 * * *" # ทุกวันเวลา 02:00 น. +``` + +### Authentication (ผ่าน Nginx Proxy Manager) + +Airbyte ไม่รองรับ Keycloak โดยตรง ใช้ Nginx Proxy Manager แทน: + +1. **OAuth2 Proxy + Keycloak** (แนะนำ) - ดู NGINX-SETUP.md +2. **Basic Authentication** - ตั้งค่าใน Nginx Access List +3. **IP Whitelist** - จำกัดการเข้าถึงตาม IP + +## การแก้ปัญหา + +### ติดตั้งไม่สำเร็จ + +1. ตรวจสอบว่า Docker กำลังทำงาน +2. ตรวจสอบว่า PostgreSQL กำลังทำงาน: `docker ps | grep postgres` +3. ตรวจสอบ logs: `abctl local logs` + +### เข้า UI ไม่ได้ + +1. ตรวจสอบสถานะ: `abctl local status` +2. ตรวจสอบ containers: `docker ps | grep airbyte` +3. รีสตาร์ท: `bash stop-airbyte.sh && bash start-airbyte.sh` + +### ติดตั้ง abctl บน Windows + +ดาวน์โหลดจาก: https://github.com/airbytehq/abctl/releases + +## ฐานข้อมูล + +Airbyte ใช้ PostgreSQL ร่วมกับ infrastructure: +- Database: `airbyte`, `temporal`, `temporal_visibility` +- Host: `postgres` (container name) +- Port: 5432 +- User/Password: ตามที่กำหนดใน `.env.global` + +ข้อมูลจะถูกสำรองอัตโนมัติพร้อมกับ PostgreSQL backup + +## Backup และ Restore + +### สำรองข้อมูล + +**Manual Backup:** +```bash +./backup-airbyte.sh +``` + +**ตั้งค่า Auto Backup (crontab):** +```bash +crontab -e +# เพิ่มบรรทัดนี้สำหรับ backup ทุกวันเวลา 02:00 น. 
+0 2 * * * cd /path/to/04-ingestion && ./backup-airbyte.sh +``` + +**ตำแหน่งไฟล์ Backup:** +- โฟลเดอร์: `./backups/` +- รูปแบบ: `airbyte_backup_YYYYMMDD_HHMMSS.tar.gz` +- เก็บไว้: 7 วันล่าสุด (ลบอัตโนมัติ) + +### Restore ข้อมูล + +```bash +# แตกไฟล์ backup +tar -xzf backups/airbyte_backup_20260227_020000.tar.gz + +# Restore databases +docker exec -i postgres psql -U postgres airbyte < backups/airbyte_20260227_020000.sql +docker exec -i postgres psql -U postgres temporal < backups/temporal_20260227_020000.sql +docker exec -i postgres psql -U postgres temporal_visibility < backups/temporal_visibility_20260227_020000.sql +``` + +## ข้อมูลเพิ่มเติม + +### การตั้งค่า Nginx Proxy Manager +ดูคู่มือโดยละเอียดที่: `NGINX-SETUP.md` + +### ทรัพยากรเพิ่มเติม +- [Airbyte Documentation](https://docs.airbyte.com/) +- [Connector Catalog](https://docs.airbyte.com/integrations/) +- [Community Support](https://airbyte.com/community) + +### สถานะการติดตั้ง +- ✅ Low Resource Mode: เปิดใช้งาน +- ✅ Latest Stable Version: ใช้เวอร์ชันล่าสุด +- ✅ Domain Access: ai.sriphat.com/airbyte +- ✅ Backup: เปิดใช้งานอัตโนมัติ +- ✅ Authentication: ผ่าน Nginx Proxy Manager diff --git a/04-ingestion/README.md b/04-ingestion/README.md index ae308d7..333e92c 100644 --- a/04-ingestion/README.md +++ b/04-ingestion/README.md @@ -1,30 +1,235 @@ # 04-ingestion: Airbyte Data Ingestion -Airbyte OSS for data ingestion and ETL (multi-container deployment). +Airbyte OSS for data ingestion and ETL using `abctl` CLI tool. -## Services +## Overview -- **airbyte-proxy**: Public entrypoint (UI/API gateway) -- **server**: Airbyte backend -- **worker**: Runs sync jobs and launches connector containers -- **webapp**: Airbyte UI -- **airbyte-temporal**: Workflow engine +This deployment uses Airbyte's official `abctl` command-line tool for easy installation and management. 
It's configured to use shared infrastructure from `01-infra`: -## Run +- **PostgreSQL**: Shared database for Airbyte metadata +- **Nginx Proxy Manager**: Shared reverse proxy for external access +- **Network**: `shared_data_network` for inter-service communication + +**Note**: `abctl` deploys an internal `airbyte-proxy` container for routing between Airbyte microservices. External access is handled by the existing Nginx Proxy Manager in `01-infra` - no additional nginx needed in this folder. + +## Prerequisites + +1. Docker Desktop installed and running +2. Infrastructure services running (PostgreSQL from `01-infra`) +3. Linux or macOS (for Windows, install abctl manually) + +## Installation + +### First Time Setup + +Run the automated setup script: ```bash -docker compose --env-file ../.env.global up -d +cd 04-ingestion +chmod +x *.sh +./setup-airbyte.sh +``` + +This script will: +- Check prerequisites (Docker, PostgreSQL) +- Install `abctl` if not present +- Create required databases (airbyte, temporal, temporal_visibility) +- Install Airbyte with custom configuration +- Configure port mapping (8030 instead of default 8000) + +Installation takes approximately 10-30 minutes depending on internet speed. + +### Manual Installation + +If you prefer manual installation: + +1. Install abctl: + ```bash + curl -LsfS https://get.airbyte.com | bash - + ``` + +2. Create databases: + ```bash + docker exec postgres psql -U postgres -c "CREATE DATABASE airbyte;" + docker exec postgres psql -U postgres -c "CREATE DATABASE temporal;" + docker exec postgres psql -U postgres -c "CREATE DATABASE temporal_visibility;" + ``` + +3. 
Install Airbyte: + ```bash + abctl local install --port 8030 --insecure-cookies + ``` + +## Usage + +### Start Airbyte +```bash +./start-airbyte.sh +``` + +### Stop Airbyte +```bash +./stop-airbyte.sh +``` + +### Uninstall Airbyte +```bash +./uninstall-airbyte.sh ``` ## Access -- Web UI: http://localhost:8000 -- Configure in Nginx to route domain to `airbyte-proxy:8000` +### Production (via Nginx Proxy Manager) +- **Domain**: https://ai.sriphat.com/airbyte +- **Authentication**: Configured via Nginx (see NGINX-SETUP.md) +- **SSL**: Enabled with Let's Encrypt -## Note +### Development/Local +- **Localhost**: http://localhost:8030 +- **Direct IP**: http://[SERVER_IP]:8030 +- **No authentication** required for local access -This deployment pins Airbyte images to avoid `:latest` tag issues. +## Configuration -## First Time Setup -1. Create database: `docker exec postgres psql -U postgres -c "CREATE DATABASE airbyte;"` -2. Access webapp and configure sources/destinations +Edit `.airbyte.env` to customize: +- `AIRBYTE_PORT`: External port (default: 8030) +- `AIRBYTE_HOST`: Domain name for external access (ai.sriphat.com) +- `LOW_RESOURCE_MODE`: **Enabled by default** for systems with <4 CPU cores +- `AIRBYTE_VERSION`: Uses latest stable version +- `ENABLE_BACKUP`: Automated backup configuration (enabled) +- Database connection settings (uses shared PostgreSQL) + +### Authentication + +Airbyte does not natively support Keycloak. Authentication is handled via **Nginx Proxy Manager**: + +1. **Recommended**: OAuth2 Proxy with Keycloak integration +2. **Alternative**: Basic Authentication via nginx +3. **Simple**: IP whitelist for internal access + +See `NGINX-SETUP.md` for detailed configuration instructions. 
+ +## Database + +Airbyte uses three databases in the shared PostgreSQL instance: +- `airbyte`: Main application database +- `temporal`: Workflow engine database +- `temporal_visibility`: Temporal visibility database + +All databases are automatically created during setup and backed up with the main PostgreSQL instance. + +## Architecture + +### Services Deployed by abctl + +- **airbyte-server**: Backend API and business logic +- **airbyte-worker**: Executes sync jobs and manages connectors +- **airbyte-webapp**: Web UI +- **airbyte-temporal**: Workflow orchestration engine +- **airbyte-proxy**: Internal Nginx reverse proxy (Airbyte entrypoint on port 8000, fronted by Nginx Proxy Manager) +- **airbyte-cron**: Scheduled job runner +- **airbyte-connector-builder-server**: Custom connector development (disabled when `LOW_RESOURCE_MODE=true`) +- **airbyte-api-server**: REST API server + +### Network Architecture + +**All services connect to `shared_data_network`:** + +``` +Internet → Nginx Proxy Manager (01-infra) → airbyte-proxy (internal) → Airbyte Services + ai.sriphat.com/airbyte port 8000 +``` + +**Shared Resources from 01-infra:** +- **Nginx Proxy Manager**: External reverse proxy (handles SSL, auth, routing) +- **PostgreSQL**: Database server (airbyte, temporal, temporal_visibility) +- **Keycloak**: Identity provider (optional, via OAuth2 Proxy) + +**Airbyte Components (deployed by abctl):** +- **airbyte-proxy**: Internal nginx for microservice routing (NOT for external access) +- **Airbyte services**: server, worker, webapp, temporal, etc. + +See `ARCHITECTURE.md` for a detailed network flow diagram. + +## Troubleshooting + +### Installation Issues + +**Error: "Readiness probe failed: HTTP probe failed with statuscode: 503"** +- This is normal during installation. Allow installation to continue. +- May need to allocate more resources to Docker Desktop. 
+ +**Error: "PostgreSQL container is not running"** +- Start infrastructure first: `cd ../01-infra && docker compose --env-file ../.env.global up -d` + +**Error: "abctl: command not found"** +- The setup script will install it automatically on Linux/macOS +- For Windows, download from: https://github.com/airbytehq/abctl/releases + +### Runtime Issues + +**Cannot access UI at localhost:8030** +- Check if Airbyte is running: `abctl local status` +- Check Docker containers: `docker ps | grep airbyte` +- View logs: `abctl local logs` + +**Sync jobs failing** +- Check worker logs: `docker logs airbyte-worker` +- Verify database connectivity +- Ensure sufficient disk space and memory + +**Low resource environments** +- Enable low-resource mode in `.airbyte.env`: `LOW_RESOURCE_MODE=true` +- Note: Connector Builder will be disabled in low-resource mode + +## Upgrading + +To upgrade Airbyte to a newer version: + +```bash +abctl local upgrade +``` + +## Backup + +### Automated Backups + +The setup script creates `backup-airbyte.sh` which backs up all Airbyte databases: +- `airbyte` - Main application database +- `temporal` - Workflow engine database +- `temporal_visibility` - Temporal visibility database + +**Manual Backup:** +```bash +./backup-airbyte.sh +``` + +**Automated Schedule:** +Add to crontab for daily backups at 2 AM: +```bash +crontab -e +# Add this line: +0 2 * * * cd /path/to/04-ingestion && ./backup-airbyte.sh +``` + +**Backup Location:** +- Directory: `./backups/` +- Format: `airbyte_backup_YYYYMMDD_HHMMSS.tar.gz` +- Retention: Last 7 days (older backups auto-deleted) + +**Restore from Backup:** +```bash +# Extract backup +tar -xzf backups/airbyte_backup_20260227_020000.tar.gz + +# Restore databases +docker exec -i postgres psql -U postgres airbyte < backups/airbyte_20260227_020000.sql +docker exec -i postgres psql -U postgres temporal < backups/temporal_20260227_020000.sql +docker exec -i postgres psql -U postgres temporal_visibility < 
backups/temporal_visibility_20260227_020000.sql +``` + +## Additional Resources + +- [Airbyte Documentation](https://docs.airbyte.com/) +- [Airbyte OSS Quickstart](https://docs.airbyte.com/platform/using-airbyte/getting-started/oss-quickstart) +- [abctl CLI Reference](https://docs.airbyte.com/platform/deploying-airbyte/abctl) diff --git a/04-ingestion/setup-airbyte.sh b/04-ingestion/setup-airbyte.sh new file mode 100644 index 0000000..fb3d777 --- /dev/null +++ b/04-ingestion/setup-airbyte.sh @@ -0,0 +1,228 @@ +#!/bin/bash + +# Airbyte OSS Setup Script +# This script automates the installation of Airbyte using abctl +# with configuration to use shared PostgreSQL from 01-infra + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +ENV_FILE="$SCRIPT_DIR/.airbyte.env" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Load configuration +if [ -f "$ENV_FILE" ]; then + source "$ENV_FILE" +else + echo -e "${RED}Error: Configuration file .airbyte.env not found${NC}" + exit 1 +fi + +echo -e "${GREEN}=== Airbyte OSS Setup ===${NC}" +echo "This script will install Airbyte using abctl with shared PostgreSQL" +echo "" + +# Check prerequisites +echo -e "${YELLOW}Checking prerequisites...${NC}" + +# Check Docker +if ! command -v docker &> /dev/null; then + echo -e "${RED}Error: Docker is not installed${NC}" + echo "Please install Docker Desktop first" + exit 1 +fi + +# Check if Docker is running +if ! docker info &> /dev/null; then + echo -e "${RED}Error: Docker is not running${NC}" + echo "Please start Docker Desktop" + exit 1 +fi + +echo -e "${GREEN}✓ Docker is installed and running${NC}" + +# Check if PostgreSQL container is running +if ! 
docker ps | grep -q "postgres"; then + echo -e "${RED}Error: PostgreSQL container is not running${NC}" + echo "Please start the infrastructure first:" + echo " cd $PROJECT_ROOT/01-infra" + echo " docker compose --env-file ../.env.global up -d" + exit 1 +fi + +echo -e "${GREEN}✓ PostgreSQL container is running${NC}" + +# Install abctl if not present +echo -e "${YELLOW}Checking for abctl...${NC}" +if ! command -v abctl &> /dev/null; then + echo "abctl not found. Installing..." + + # Detect OS + OS="$(uname -s)" + case "${OS}" in + Linux*) + curl -LsfS https://get.airbyte.com | bash - + ;; + Darwin*) + curl -LsfS https://get.airbyte.com | bash - + ;; + MINGW*|MSYS*|CYGWIN*) + echo -e "${RED}Windows detected. Please install abctl manually:${NC}" + echo "Download from: https://github.com/airbytehq/abctl/releases" + exit 1 + ;; + *) + echo -e "${RED}Unsupported OS: ${OS}${NC}" + exit 1 + ;; + esac + + # Verify installation + if ! command -v abctl &> /dev/null; then + echo -e "${RED}Failed to install abctl${NC}" + exit 1 + fi +fi + +echo -e "${GREEN}✓ abctl is installed${NC}" + +# Create databases in PostgreSQL +echo -e "${YELLOW}Creating Airbyte databases...${NC}" + +# Check if databases already exist +DB_EXISTS=$(docker exec postgres psql -U postgres -tAc "SELECT 1 FROM pg_database WHERE datname='airbyte'" 2>/dev/null || echo "0") + +if [ "$DB_EXISTS" = "1" ]; then + echo -e "${YELLOW}Airbyte databases already exist. Skipping creation.${NC}" +else + echo "Creating airbyte, temporal, and temporal_visibility databases..." 
+ docker exec postgres psql -U postgres -c "CREATE DATABASE airbyte;" 2>/dev/null || true + docker exec postgres psql -U postgres -c "CREATE DATABASE temporal;" 2>/dev/null || true + docker exec postgres psql -U postgres -c "CREATE DATABASE temporal_visibility;" 2>/dev/null || true + docker exec postgres psql -U postgres -c "GRANT ALL PRIVILEGES ON DATABASE airbyte TO postgres;" 2>/dev/null || true + docker exec postgres psql -U postgres -c "GRANT ALL PRIVILEGES ON DATABASE temporal TO postgres;" 2>/dev/null || true + docker exec postgres psql -U postgres -c "GRANT ALL PRIVILEGES ON DATABASE temporal_visibility TO postgres;" 2>/dev/null || true + echo -e "${GREEN}✓ Databases created${NC}" +fi + +# Prepare abctl install command +echo -e "${YELLOW}Preparing Airbyte installation...${NC}" +echo "Configuration:" +echo " - Version: Latest stable (determined by abctl)" +echo " - Port: ${AIRBYTE_PORT}" +echo " - Domain: ${AIRBYTE_HOST}" +echo " - Low Resource Mode: ${LOW_RESOURCE_MODE}" +echo " - Backup: ${ENABLE_BACKUP}" +echo "" + +INSTALL_CMD="abctl local install" + +# Add port mapping +if [ ! -z "$AIRBYTE_PORT" ] && [ "$AIRBYTE_PORT" != "8000" ]; then + INSTALL_CMD="$INSTALL_CMD --port $AIRBYTE_PORT" +fi + +# Add host if specified (for domain access) +if [ ! -z "$AIRBYTE_HOST" ]; then + INSTALL_CMD="$INSTALL_CMD --host $AIRBYTE_HOST" +fi + +# Add insecure cookies flag (required when behind nginx proxy) +if [ "$INSECURE_COOKIES" = "true" ]; then + INSTALL_CMD="$INSTALL_CMD --insecure-cookies" +fi + +# Add low resource mode (enabled by default) +if [ "$LOW_RESOURCE_MODE" = "true" ]; then + INSTALL_CMD="$INSTALL_CMD --low-resource-mode" + echo -e "${YELLOW}Note: Low-resource mode enabled. Connector Builder will be disabled.${NC}" +fi + +echo "Installation command: $INSTALL_CMD" +echo "" + +# Run installation +echo -e "${YELLOW}Installing Airbyte...${NC}" +echo "This may take up to 30 minutes depending on your internet connection." 
+echo "" + +eval $INSTALL_CMD + +# Check installation status +if [ $? -eq 0 ]; then + echo "" + echo -e "${GREEN}=== Airbyte Installation Complete ===${NC}" + echo "" + + # Setup backup if enabled + if [ "$ENABLE_BACKUP" = "true" ]; then + echo -e "${YELLOW}Setting up automated backups...${NC}" + cat > "$SCRIPT_DIR/backup-airbyte.sh" << 'BACKUP_SCRIPT' +#!/bin/bash +# Airbyte Backup Script +# Backs up Airbyte databases from PostgreSQL + +BACKUP_DIR="./backups" +mkdir -p "$BACKUP_DIR" + +DATE=$(date +%Y%m%d_%H%M%S) + +echo "Backing up Airbyte databases..." +docker exec postgres pg_dump -U postgres airbyte > "$BACKUP_DIR/airbyte_$DATE.sql" +docker exec postgres pg_dump -U postgres temporal > "$BACKUP_DIR/temporal_$DATE.sql" +docker exec postgres pg_dump -U postgres temporal_visibility > "$BACKUP_DIR/temporal_visibility_$DATE.sql" + +# Compress backups +tar -czf "$BACKUP_DIR/airbyte_backup_$DATE.tar.gz" "$BACKUP_DIR/airbyte_$DATE.sql" "$BACKUP_DIR/temporal_$DATE.sql" "$BACKUP_DIR/temporal_visibility_$DATE.sql" +rm "$BACKUP_DIR/airbyte_$DATE.sql" "$BACKUP_DIR/temporal_$DATE.sql" "$BACKUP_DIR/temporal_visibility_$DATE.sql" + +# Keep only last 7 days of backups +find "$BACKUP_DIR" -name "airbyte_backup_*.tar.gz" -mtime +7 -delete + +echo "Backup completed: airbyte_backup_$DATE.tar.gz" +BACKUP_SCRIPT + chmod +x "$SCRIPT_DIR/backup-airbyte.sh" + echo -e "${GREEN}✓ Backup script created: backup-airbyte.sh${NC}" + echo " Run manually: ./backup-airbyte.sh" + echo " Schedule: Add to crontab with schedule: $BACKUP_SCHEDULE" + fi + + echo "" + echo "Access Airbyte at:" + echo " Domain: https://ai.sriphat.com/airbyte (via Nginx Proxy Manager)" + echo " Local: http://localhost:${AIRBYTE_PORT:-8000}" + echo " Direct: http://[SERVER_IP]:${AIRBYTE_PORT:-8000}" + echo "" + echo -e "${YELLOW}Important: Configure Nginx Proxy Manager${NC}" + echo "1. Go to Nginx Proxy Manager (http://localhost:8021)" + echo "2. 
Add Proxy Host:"
+    echo " - Domain: ai.sriphat.com"
+    echo " - Scheme: http"
+    echo " - Forward Hostname: airbyte-proxy"
+    echo " - Forward Port: 8000"
+    echo " - Custom Location: /airbyte"
+    echo "3. Enable SSL certificate"
+    echo "4. (Optional) Add Keycloak authentication via nginx"
+    echo ""
+    echo "Next steps:"
+    echo "1. Configure Nginx Proxy Manager (see above)"
+    echo "2. Open Airbyte in your browser"
+    echo "3. Enter your email and organization name"
+    echo "4. Configure your sources and destinations"
+    echo ""
+    echo "To manage Airbyte:"
+    echo " Start: ./start-airbyte.sh"
+    echo " Stop: ./stop-airbyte.sh"
+    echo " Backup: ./backup-airbyte.sh"
+    echo " Uninstall: ./uninstall-airbyte.sh"
+    echo ""
+else
+    echo -e "${RED}Installation failed. Please check the error messages above.${NC}"
+    exit 1
+fi
diff --git a/04-ingestion/start-airbyte.sh b/04-ingestion/start-airbyte.sh
new file mode 100644
index 0000000..36ecfba
--- /dev/null
+++ b/04-ingestion/start-airbyte.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Start Airbyte services
+# This script starts the Airbyte deployment using abctl
+
+set -e
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+echo -e "${YELLOW}Starting Airbyte...${NC}"
+
+# Check if abctl is installed
+if ! command -v abctl &> /dev/null; then
+    echo "Error: abctl is not installed"
+    echo "Please run ./setup-airbyte.sh first"
+    exit 1
+fi
+
+# Start Airbyte.
+# The command is the `if` condition on purpose: with `set -e` a bare failing
+# command would abort the script before the failure branch below could run.
+if abctl local up; then
+    echo -e "${GREEN}Airbyte started successfully${NC}"
+    echo "Access at: http://localhost:${AIRBYTE_PORT:-8030}"
+else
+    echo "Failed to start Airbyte"
+    exit 1
+fi
diff --git a/04-ingestion/stop-airbyte.sh b/04-ingestion/stop-airbyte.sh
new file mode 100644
index 0000000..621cd8b
--- /dev/null
+++ b/04-ingestion/stop-airbyte.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Stop Airbyte services
+# This script stops the Airbyte deployment using abctl
+
+set -e
+
+# Colors for output
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+echo -e "${YELLOW}Stopping Airbyte...${NC}"
+
+# Check if abctl is installed
+if ! command -v abctl &> /dev/null; then
+    echo "Error: abctl is not installed"
+    exit 1
+fi
+
+# Stop Airbyte.
+# The command is the `if` condition on purpose: with `set -e` a bare failing
+# command would abort the script before the failure branch below could run.
+if abctl local down; then
+    echo -e "${GREEN}Airbyte stopped successfully${NC}"
+else
+    echo "Failed to stop Airbyte"
+    exit 1
+fi
diff --git a/04-ingestion/uninstall-airbyte.sh b/04-ingestion/uninstall-airbyte.sh
new file mode 100644
index 0000000..a8493f6
--- /dev/null
+++ b/04-ingestion/uninstall-airbyte.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Uninstall Airbyte
+# This script completely removes the Airbyte deployment
+
+set -e
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+echo -e "${RED}=== Airbyte Uninstall ===${NC}"
+echo "This will completely remove Airbyte and all its data."
+echo ""
+read -p "Are you sure you want to continue? (yes/no): " -r
+echo
+
+if [[ ! $REPLY =~ ^[Yy][Ee][Ss]$ ]]; then
+    echo "Uninstall cancelled"
+    exit 0
+fi
+
+# Check if abctl is installed
+if ! command -v abctl &> /dev/null; then
+    echo -e "${YELLOW}abctl is not installed. 
Skipping abctl uninstall.${NC}" +else + echo -e "${YELLOW}Uninstalling Airbyte...${NC}" + abctl local uninstall +fi + +# Remove local data directories +echo -e "${YELLOW}Removing local data directories...${NC}" +if [ -d "./data" ]; then + rm -rf ./data + echo "Removed ./data" +fi + +# Ask about database cleanup +echo "" +read -p "Do you want to drop Airbyte databases from PostgreSQL? (yes/no): " -r +echo + +if [[ $REPLY =~ ^[Yy][Ee][Ss]$ ]]; then + echo -e "${YELLOW}Dropping databases...${NC}" + docker exec postgres psql -U postgres -c "DROP DATABASE IF EXISTS airbyte;" 2>/dev/null || true + docker exec postgres psql -U postgres -c "DROP DATABASE IF EXISTS temporal;" 2>/dev/null || true + docker exec postgres psql -U postgres -c "DROP DATABASE IF EXISTS temporal_visibility;" 2>/dev/null || true + echo -e "${GREEN}Databases dropped${NC}" +fi + +echo "" +echo -e "${GREEN}Airbyte uninstall complete${NC}" diff --git a/06-analytics/Dockerfile b/06-analytics/Dockerfile new file mode 100644 index 0000000..7896272 --- /dev/null +++ b/06-analytics/Dockerfile @@ -0,0 +1,15 @@ +FROM apache/superset:latest + +# Switch to root so that packages can be installed +USER root + +# Install the PostgreSQL driver (the /app/.venv virtualenv is put on PATH first) +ENV PATH="/app/.venv/bin:$PATH" + +RUN python -m ensurepip --upgrade && \ +/app/.venv/bin/python -m pip install --upgrade pip setuptools wheel + +RUN /app/.venv/bin/pip install --no-cache-dir psycopg2-binary + +# Switch back to the superset user for security +USER superset diff --git a/06-analytics/docker-compose.yml b/06-analytics/docker-compose.yml index 42780e5..987dea9 100644 --- a/06-analytics/docker-compose.yml +++ b/06-analytics/docker-compose.yml @@ -1,9 +1,12 @@ services: superset: - image: apache/superset:latest + #image: apache/superset:latest + build: + context: . 
+ dockerfile: Dockerfile container_name: superset env_file: - - ../.env.global + - ../.env environment: - SUPERSET_SECRET_KEY=${SUPERSET_SECRET_KEY} - DATABASE_DIALECT=postgresql @@ -14,6 +17,10 @@ services: - DATABASE_PASSWORD=${DB_PASSWORD} - SUPERSET_LOAD_EXAMPLES=no - TZ=${TZ:-Asia/Bangkok} + - SUPERSET_BIND_ADDRESS=0.0.0.0 + - SUPERSET_PORT=8088 + ports: + - "8088:8088" volumes: - ./data/superset_home:/app/superset_home - ./superset_config.py:/app/pythonpath/superset_config.py diff --git a/06-analytics/superset_config.py b/06-analytics/superset_config.py index b996126..e003e3f 100644 --- a/06-analytics/superset_config.py +++ b/06-analytics/superset_config.py @@ -1,7 +1,9 @@ import os +from urllib.parse import quote_plus SECRET_KEY = os.environ.get('SUPERSET_SECRET_KEY') -SQLALCHEMY_DATABASE_URI = f"postgresql://{os.environ.get('DATABASE_USER')}:{os.environ.get('DATABASE_PASSWORD')}@{os.environ.get('DATABASE_HOST')}:{os.environ.get('DATABASE_PORT')}/{os.environ.get('DATABASE_DB')}" +# URL-encode the credentials so that special characters (such as "@", ":" or "/") in the user name or password cannot corrupt the connection URI. +SQLALCHEMY_DATABASE_URI = f"postgresql+psycopg2://{quote_plus(os.environ.get('DATABASE_USER', ''))}:{quote_plus(os.environ.get('DATABASE_PASSWORD', ''))}@{os.environ.get('DATABASE_HOST')}:{os.environ.get('DATABASE_PORT')}/{os.environ.get('DATABASE_DB')}" ENABLE_PROXY_FIX = True PUBLIC_ROLE_LIKE = "Gamma"