dhamsey3 · dhamsey3 · May 13, 2026 · May 13, 2026
diff --git a/.env.example b/.env.example
@@ -0,0 +1,15 @@
+# Example settings for local development. Copy this file to `.env`
+# (`cp .env.example .env`) and adjust the values; `.env` itself is git-ignored.
+ENVIRONMENT=local
+APP_DEBUG=true
+DATABASE_URL=sqlite:///./idp.db
+REDIS_URL=redis://localhost:6379/0
+# Placeholder only: replace with a long random value before use.
+SECRET_KEY=replace-with-a-long-random-secret
+JWT_ALGORITHM=HS256
+ACCESS_TOKEN_EXPIRE_MINUTES=60
+AWS_REGION=us-east-1
+DEFAULT_INGRESS_DOMAIN=apps.local
+KUBERNETES_NAMESPACE_PREFIX=tenant
+# Per the README, keep both dry-run flags set to true when developing without
+# a Kubernetes cluster or Terraform credentials.
+KUBERNETES_DRY_RUN=true
+TERRAFORM_DRY_RUN=true
+# Placeholders: per the README, replace both with a real state bucket and lock
+# table before enabling real Terraform execution.
+TERRAFORM_STATE_BUCKET=replace-me-terraform-state
+TERRAFORM_LOCK_TABLE=replace-me-terraform-locks
+RATE_LIMIT_REQUESTS_PER_HOUR=100
diff --git a/.github/workflows/ci-cd.yaml b/.github/workflows/ci-cd.yaml
@@ -0,0 +1,41 @@
+name: CI/CD Pipeline
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    # CI-only settings: a throwaway SQLite database, a dummy secret, and the
+    # dry-run flags the README recommends when no cluster or Terraform
+    # credentials are available.
+    env:
+      DATABASE_URL: sqlite:///./test-idp.db
+      SECRET_KEY: ci-test-secret
+      KUBERNETES_DRY_RUN: "true"
+      TERRAFORM_DRY_RUN: "true"
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Lint
+        run: |
+          pip install flake8
+          flake8 app api auth database services tests --max-line-length=120
+      - name: Run tests
+        run: |
+          pytest
+      - name: Build Docker image
+        run: |
+          docker build -t idp-api:${{ github.sha }} .
+      # The two steps below are placeholders; both are restricted to pushes on
+      # main so pull-request builds stop after the image build.
+      - name: Push Docker image (optional)
+        if: github.ref == 'refs/heads/main'
+        run: echo "Configure registry login and docker push here"
+      - name: Deploy to Kubernetes (optional)
+        if: github.ref == 'refs/heads/main'
+        run: echo "Deploy with Helm or ArgoCD after registry configuration"
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+.venv/
+__pycache__/
+*.py[cod]
+*.db
+.pytest_cache/
+.env
+.history/
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.11-slim
+
+# Create the non-root user the container runs as.
+RUN useradd -m appuser
+WORKDIR /app
+
+# Install dependencies before copying the source so this layer can be reused
+# from the build cache when only application code changes.
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt \
+    && rm -rf /root/.cache
+
+COPY . .
+
+# Hand /app over to the non-root user, then drop root for everything after.
+RUN chown -R appuser:appuser /app
+USER appuser
+
+ENV PYTHONUNBUFFERED=1
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/README.md b/README.md
@@ -1,42 +1,225 @@
-# Single Application Cluster Setup in Kubernetes
+# Cloud Infrastructure Provisioning API
 
-This guide outlines the process for deploying a single application within a Kubernetes cluster, from cluster creation to application access.
+This project is an Internal Developer Platform API built on FastAPI, Kubernetes, Terraform, PostgreSQL/SQLite, Redis, Helm, Prometheus, Grafana, and JWT authentication. It extends the original Kubernetes manifests into a mini Heroku/Render/Railway-style platform for provisioning infrastructure and deploying containerized applications through REST APIs.
 
+## Architecture
 
-## Step 1: Write a Kubernetes Deployment YAML File
+The API receives authenticated platform requests, validates input, stores metadata in the database, and orchestrates Kubernetes or Terraform operations through service-layer modules.
 
-Define the application's deployment on Kubernetes:
+Request flow for application deployment:
 
-- Draft a YAML file (`deployment-file.yaml`) that specifies the deployment details, including the Docker image, ports, environment variables, and other configurations.
+1. User authenticates with JWT.
+2. API validates Docker image, namespace, port, replica, ingress, and autoscaling inputs.
+3. A deployment row is created in the database.
+4. Kubernetes service layer creates namespace, Deployment, Service, Ingress, and HPA.
+5. Deployment status, URL, autoscaling settings, and errors are persisted.
+6. Users query deployment status, logs, metrics, and cluster health through API endpoints.
 
-## Step 2: Create a Kubernetes Deployment
+## Project Structure
 
-Deploy the application using `kubectl`:
+```text
+app/              FastAPI app, configuration, logging
+api/              Route handlers and Pydantic schemas
+auth/             JWT, RBAC, rate limiting
+database/         SQLAlchemy models and session lifecycle
+services/         Kubernetes, Terraform, deployment, monitoring logic
+kubernetes/       Cluster RBAC and network policy examples
+terraform/        AWS Terraform templates
+helm/             Helm chart for the API itself
+monitoring/       Prometheus and Grafana examples
+scripts/          Bootstrap, migration, production checklist helpers
+tests/            Unit tests
+```
 
-- Run `kubectl apply -f <deployment-file.yaml>` to deploy the application, replacing `<deployment-file.yaml>` with the path to the YAML file.
+## API Endpoints
 
-## Step 3: Expose The Deployment as a Service
+Auth:
 
-Make the deployment accessible:
+- `POST /auth/register`
+- `POST /auth/login`
+- `GET /auth/me`
 
-- Expose the deployment by executing:
+Infrastructure:
 
-Replace `<deployment-name>` and `<container-port>` with specific details to create a LoadBalancer service for external traffic.
+- `POST /infrastructure/create`
+- `GET /infrastructure/{id}`
+- `DELETE /infrastructure/{id}`
 
-## Step 4: Verify the Deployment
+Deployments:
 
-Check the status of the deployment:
+- `POST /deployments`
+- `GET /deployments/{id}`
+- `DELETE /deployments/{id}`
 
-- Confirm the application's pod is running with `kubectl get pods`.
-- Check the service creation and the external IP assignment with `kubectl get services`.
+Kubernetes:
 
-## Step 5: Access The Application
+- `POST /namespace/create`
+- `POST /service/expose`
+- `POST /autoscaling/create`
+- `POST /kubernetes/ingress/create`
 
-Visit the deployed application:
+Monitoring:
 
-- Use a web browser to navigate to the external IP address assigned to the service and access the application.
+- `GET /cluster/health`
+- `GET /metrics`
+- `GET /logs/{pod}?namespace=default`
 
-Following these steps will get the application deployed and accessible within a Kubernetes cluster.
+Swagger/OpenAPI is available at `/docs`.
 
+## Local Development
 
-<img width="1164" alt="Screenshot 2023-04-02 at 20 16 24" src="https://user-images.githubusercontent.com/73405591/229371245-1692b2ac-d4c5-456b-b983-6fa1bfd115ab.png">
+Create an environment file:
+
+```bash
+cp .env.example .env
+```
+
+For local development without a Kubernetes cluster or Terraform credentials, keep:
+
+```bash
+KUBERNETES_DRY_RUN=true
+TERRAFORM_DRY_RUN=true
+DATABASE_URL=sqlite:///./idp.db
+```
+
+Install dependencies and run the API:
+
+```bash
+python3 -m venv .venv
+./.venv/bin/pip install -r requirements.txt
+./.venv/bin/uvicorn app.main:app --reload
+```
+
+Register and log in:
+
+```bash
+curl -X POST http://localhost:8000/auth/register \
+  -H "Content-Type: application/json" \
+  -d '{"username":"platform-user","password":"change-me-123"}'
+
+TOKEN=$(curl -s -X POST http://localhost:8000/auth/login \
+  -H "Content-Type: application/json" \
+  -d '{"username":"platform-user","password":"change-me-123"}' | jq -r .access_token)
+```
+
+Deploy an application:
+
+```bash
+curl -X POST http://localhost:8000/deployments \
+  -H "Authorization: Bearer $TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "name": "demo-api",
+    "image": "nginx:1.25",
+    "port": 80,
+    "replicas": 2,
+    "min_replicas": 1,
+    "max_replicas": 5,
+    "cpu_threshold": 70
+  }'
+```
+
+## Terraform
+
+The infrastructure API renders [terraform/main.tf.j2](terraform/main.tf.j2) and can create or destroy AWS resources. It runs in dry-run mode by default. Before enabling real execution:
+
+- Create an encrypted S3 backend bucket.
+- Create a DynamoDB lock table.
+- Replace `TERRAFORM_STATE_BUCKET` and `TERRAFORM_LOCK_TABLE`.
+- Use IAM roles with least privilege.
+- Review generated plans before production use.
+
+Example:
+
+```bash
+curl -X POST http://localhost:8000/infrastructure/create \
+  -H "Authorization: Bearer $TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "name": "platform-dev",
+    "cloud_provider": "aws",
+    "config": {
+      "aws_region": "us-east-1",
+      "eks_role_arn": "arn:aws:iam::123456789012:role/EKSRole"
+    }
+  }'
+```
+
+## Helm Deployment
+
+Render or install the API chart:
+
+```bash
+helm template idp-api helm/charts/idp-api \
+  --set secrets.databaseUrl='postgresql://user:pass@postgres:5432/idp' \
+  --set secrets.secretKey='replace-with-long-random-secret'
+
+helm upgrade --install idp-api helm/charts/idp-api \
+  --set image.repository=registry.example.com/idp-api \
+  --set image.tag=v1 \
+  --set secrets.databaseUrl='postgresql://user:pass@postgres:5432/idp' \
+  --set secrets.secretKey='replace-with-long-random-secret'
+```
+
+## Security
+
+Implemented:
+
+- JWT authentication
+- Role-aware user model
+- Protected infrastructure, deployment, Kubernetes, and monitoring APIs
+- Redis-backed rate limiting with local fallback
+- Non-root Docker container
+- Kubernetes RBAC and network-policy examples
+- No hardcoded production secrets in the Helm chart (secrets are supplied at install time)
+
+Recommended before production:
+
+- Use AWS Secrets Manager, External Secrets Operator, or sealed-secrets.
+- Replace SQLite with managed PostgreSQL.
+- Use Alembic migrations.
+- Run Terraform through a job queue or workflow engine rather than synchronous HTTP requests.
+- Enforce tenant-aware namespace ownership.
+- Add admission policies with Kyverno or OPA Gatekeeper.
+- Use image allowlists and vulnerability scanning.
+- Require immutable image digests for production deployments.
+
+## Observability
+
+The API exposes Prometheus metrics at `/metrics`. Example scrape configuration and a Grafana dashboard starter live in `monitoring/`.
+
+Recommended production stack:
+
+- Prometheus Operator
+- Grafana dashboards for API latency, error rate, Kubernetes deployment state, and Terraform failures
+- Loki or OpenSearch for structured logs
+- Alertmanager alerts for failed provisions, high error rate, and unhealthy clusters
+
+## CI/CD
+
+The GitHub Actions workflow installs dependencies, runs linting/tests, and builds the Docker image. Registry push and Kubernetes deployment are intentionally placeholders until registry, cluster, and secret strategy are configured.
+
+## Implementation Phases
+
+Phase 1: Architecture and folder structure are represented by the layered app layout.
+
+Phase 2: FastAPI backend includes auth, validation, database models, OpenAPI, health checks, and rate limiting.
+
+Phase 3: Kubernetes integration creates namespaces, deployments, services, ingress, and HPA, and supports status checks, logs, and safe deletes.
+
+Phase 4: Terraform automation renders AWS templates and supports apply/destroy with remote-state configuration.
+
+Phase 5: Monitoring exposes Prometheus metrics, cluster health, pod logs, and dashboard examples.
+
+Phase 6: CI/CD lints, tests, and builds the Docker image, and includes placeholder image-push and deployment stages.
+
+Phase 7: Production hardening is documented in `scripts/prod_checklist.md` and should be completed before real cloud use.
+
+## Scaling Recommendations
+
+- Move long-running deploy/provision tasks to Celery, RQ, Temporal, or Argo Workflows.
+- Add per-tenant quotas for namespaces, replicas, CPU, memory, and load balancers.
+- Use GitOps with ArgoCD for reconciliation and auditability.
+- Split API, worker, scheduler, and webhook receiver into separate deployments.
+- Use PostgreSQL row-level ownership checks and explicit tenant IDs.
+- Add blue/green and canary deployment strategies with Argo Rollouts or Flagger.
diff --git a/api/routes/auth.py b/api/routes/auth.py
@@ -0,0 +1,33 @@
+from fastapi import APIRouter, HTTPException, Depends
+from sqlalchemy.orm import Session
+from database.models import User
+from auth.jwt_utils import get_password_hash, verify_password, create_access_token
+from auth.rbac import get_current_user
+from database.session import get_db
+from api.schemas import LoginRequest, RegisterRequest, TokenResponse
+
+# Router that collects the auth endpoints defined below (register, login, me).
+router = APIRouter()
+
+@router.post("/register")
+def register(request: RegisterRequest, db: Session = Depends(get_db)):
+    """Create a user account and return its public fields.
+
+    Args:
+        request: Payload carrying the desired ``username`` and ``password``.
+        db: SQLAlchemy session supplied by the ``get_db`` dependency.
+
+    Returns:
+        A dict with the new user's ``id``, ``username`` and ``role``.
+
+    Raises:
+        HTTPException: 400 when the username is already registered.
+    """
+    # Imported locally so this handler stays self-contained.
+    from sqlalchemy.exc import IntegrityError
+
+    if db.query(User).filter(User.username == request.username).first():
+        raise HTTPException(status_code=400, detail="Username already registered")
+
+    new_user = User(
+        username=request.username,
+        hashed_password=get_password_hash(request.password),
+    )
+    db.add(new_user)
+    try:
+        db.commit()
+    except IntegrityError:
+        # Two concurrent registrations can both pass the lookup above. If the
+        # database enforces uniqueness on the username (NOTE(review): confirm
+        # the constraint on User.username), the losing request must roll back
+        # and report the same 400 instead of surfacing an unhandled 500.
+        db.rollback()
+        raise HTTPException(status_code=400, detail="Username already registered") from None
+    db.refresh(new_user)
+    return {"id": new_user.id, "username": new_user.username, "role": new_user.role}
+
+@router.post("/login", response_model=TokenResponse)
+def login(request: LoginRequest, db: Session = Depends(get_db)):
+    """Exchange a username/password pair for a bearer access token.
+
+    Args:
+        request: Payload carrying ``username`` and ``password``.
+        db: SQLAlchemy session supplied by the ``get_db`` dependency.
+
+    Returns:
+        A dict with ``access_token`` and ``token_type`` set to ``"bearer"``.
+
+    Raises:
+        HTTPException: 401 when no user matches or the password check fails.
+    """
+    account = db.query(User).filter(User.username == request.username).first()
+    # An unknown user and a wrong password deliberately share one response.
+    if not (account and verify_password(request.password, account.hashed_password)):
+        raise HTTPException(status_code=401, detail="Invalid credentials")
+    # The token carries the user id (as a string) and the user's role.
+    claims = {"sub": str(account.id), "role": account.role}
+    return {"access_token": create_access_token(claims), "token_type": "bearer"}
+
+@router.get("/me")
+def get_me(current_user=Depends(get_current_user)):
+    """Return the id, username and role of the user resolved by ``get_current_user``."""
+    profile = {field: getattr(current_user, field) for field in ("id", "username", "role")}
+    return profile