feat: initial Claude Code configuration scaffold

Comprehensive Claude Code guidance system with:

- 5 agents: tdd-guardian, code-reviewer, security-scanner, refactor-scan, dependency-audit
- 18 skills covering languages (Python, TypeScript, Rust, Go, Java, C#),
  infrastructure (AWS, Azure, GCP, Terraform, Ansible, Docker/K8s, Database, CI/CD),
  testing (TDD, UI, Browser), and patterns (Monorepo, API Design, Observability)
- 3 hooks: secret detection, auto-formatting, TDD git pre-commit
- Strict TDD enforcement with 80%+ coverage requirements
- Multi-model strategy: Opus for planning, Sonnet for execution (opusplan)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-20 15:47:34 -05:00
commit befb8fbaeb
34 changed files with 12233 additions and 0 deletions

View File

@@ -0,0 +1,462 @@
---
name: api-design
description: REST API design patterns with Pydantic/Zod schemas, error handling, and OpenAPI documentation. Use when designing or implementing API endpoints.
---
# API Design Skill
## Schema-First Development
Always define schemas before implementation. Schemas serve as:
- Runtime validation
- Type definitions
- API documentation
- Test data factories
### Python (Pydantic)
```python
# schemas/user.py
from datetime import datetime
from pydantic import BaseModel, EmailStr, Field, field_validator
class UserBase(BaseModel):
    """Fields common to the create and response user schemas."""

    email: EmailStr
    # No default is supplied, so the field stays required.
    name: str = Field(min_length=1, max_length=100)
class UserCreate(UserBase):
    """Payload accepted when creating a user."""

    password: str = Field(min_length=8)

    @field_validator("password")
    @classmethod
    def password_strength(cls, v: str) -> str:
        """Require at least one uppercase character and one digit.

        Raises:
            ValueError: naming the first rule the password violates.
        """
        # Order matters: the uppercase message wins when both rules fail.
        rules = (
            (str.isupper, "Password must contain uppercase"),
            (str.isdigit, "Password must contain digit"),
        )
        for matches, message in rules:
            if not any(map(matches, v)):
                raise ValueError(message)
        return v
class UserUpdate(BaseModel):
    """Partial-update payload; every field defaults to ``None``."""

    email: EmailStr | None = None
    name: str | None = Field(default=None, min_length=1, max_length=100)
class UserResponse(UserBase):
    """User representation sent to clients; it declares no password field."""

    # Lets the model be validated from objects exposing attributes, not only dicts.
    model_config = {"from_attributes": True}

    id: str
    is_active: bool
    created_at: datetime
class UserListResponse(BaseModel):
    """A page of users together with the paging counters."""

    items: list[UserResponse]
    total: int
    page: int
    page_size: int
    has_more: bool
```
### TypeScript (Zod)
```typescript
// schemas/user.schema.ts
import { z } from 'zod';
// Fields shared by the create, update and response schemas.
const userBaseShape = {
  email: z.string().email(),
  name: z.string().min(1).max(100),
};

export const userBaseSchema = z.object(userBaseShape);
// Password rules, checked in this order after the minimum-length check.
const hasUppercase = (value: string): boolean => /[A-Z]/.test(value);
const hasDigit = (value: string): boolean => /\d/.test(value);

export const userCreateSchema = userBaseSchema.extend({
  password: z
    .string()
    .min(8)
    .refine(hasUppercase, 'Must contain uppercase')
    .refine(hasDigit, 'Must contain digit'),
});
// Every base field becomes optional for updates.
export const userUpdateSchema = userBaseSchema.partial();

// Fields added on top of the shared base in API responses.
const userResponseExtras = {
  id: z.string().uuid(),
  isActive: z.boolean(),
  createdAt: z.string().datetime(),
};

export const userResponseSchema = userBaseSchema.extend(userResponseExtras);
// Shared constraint for the 1-based page number and the page size.
const positiveInt = z.number().int().positive();

export const userListResponseSchema = z.object({
  items: z.array(userResponseSchema),
  total: z.number().int().nonnegative(),
  page: positiveInt,
  pageSize: positiveInt,
  hasMore: z.boolean(),
});
// Static types inferred from the schemas, so types and validation cannot drift apart.
export type UserCreate = z.infer<typeof userCreateSchema>;
export type UserUpdate = z.infer<typeof userUpdateSchema>;
export type UserResponse = z.infer<typeof userResponseSchema>;
export type UserListResponse = z.infer<typeof userListResponseSchema>;

// Validation helpers for API boundaries; `parse` throws when the data does not match.
export function parseUserCreate(data: unknown): UserCreate {
  return userCreateSchema.parse(data);
}

export function parseUserResponse(data: unknown): UserResponse {
  return userResponseSchema.parse(data);
}
```
## REST Endpoint Patterns
### Resource Naming
```
GET /users # List users
POST /users # Create user
GET /users/{id} # Get single user
PUT /users/{id} # Full update
PATCH /users/{id} # Partial update
DELETE /users/{id} # Delete user
# Nested resources
GET /users/{id}/orders # User's orders
POST /users/{id}/orders # Create order for user
# Actions (when CRUD doesn't fit)
POST /users/{id}/activate
POST /orders/{id}/cancel
```
### FastAPI Implementation
```python
# routers/users.py
from fastapi import APIRouter, Depends, HTTPException, Query, status
from sqlalchemy.ext.asyncio import AsyncSession
from app.schemas.user import (
UserCreate,
UserUpdate,
UserResponse,
UserListResponse,
)
from app.services.user import UserService
from app.dependencies import get_db, get_current_user
router = APIRouter(prefix="/users", tags=["users"])
@router.get("", response_model=UserListResponse)
async def list_users(
    page: int = Query(1, ge=1),
    page_size: int = Query(20, ge=1, le=100),
    db: AsyncSession = Depends(get_db),
) -> UserListResponse:
    """Return one page of users.

    ``page`` starts at 1 and ``page_size`` is limited to 1-100 by the
    ``Query`` constraints; the lookup itself is delegated to ``UserService``.
    """
    users_page = await UserService(db).list_users(page=page, page_size=page_size)
    return users_page
@router.post("", response_model=UserResponse, status_code=status.HTTP_201_CREATED)
async def create_user(
    data: UserCreate,
    db: AsyncSession = Depends(get_db),
) -> UserResponse:
    """Create a new user and respond with 201.

    Raises:
        HTTPException: 400 carrying the message of any ``ValueError`` raised
            by ``UserService.create_user``.
    """
    service = UserService(db)
    try:
        return await service.create_user(data)
    except ValueError as exc:
        # Chain the cause so the original traceback survives in logs.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(exc),
        ) from exc
@router.get("/{user_id}", response_model=UserResponse)
async def get_user(
    user_id: str,
    db: AsyncSession = Depends(get_db),
) -> UserResponse:
    """Fetch a single user by ID.

    Raises:
        HTTPException: 404 when the service returns a falsy result.
    """
    found = await UserService(db).get_user(user_id)
    if found:
        return found
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="User not found")
@router.patch("/{user_id}", response_model=UserResponse)
async def update_user(
    user_id: str,
    data: UserUpdate,
    db: AsyncSession = Depends(get_db),
    current_user: UserResponse = Depends(get_current_user),
) -> UserResponse:
    """Apply a partial update to the user identified by ``user_id``.

    ``current_user`` is resolved through ``get_current_user`` but never read
    in the body.
    NOTE(review): presumably the dependency only gates the route behind
    authentication; confirm whether an ownership or role check is also needed.

    Raises:
        HTTPException: 404 when the service returns a falsy result.
    """
    updated = await UserService(db).update_user(user_id, data)
    if updated:
        return updated
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="User not found")
@router.delete("/{user_id}", status_code=status.HTTP_204_NO_CONTENT)
async def delete_user(
    user_id: str,
    db: AsyncSession = Depends(get_db),
    current_user: UserResponse = Depends(get_current_user),
) -> None:
    """Delete the user identified by ``user_id`` and respond with 204.

    ``current_user`` is resolved through ``get_current_user`` but never read
    in the body.
    NOTE(review): presumably an authentication gate only; confirm whether an
    ownership or role check is also needed.

    Raises:
        HTTPException: 404 when the service reports that nothing was deleted.
    """
    was_deleted = await UserService(db).delete_user(user_id)
    if was_deleted:
        return
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="User not found")
```
## Error Handling
### Standard Error Response (RFC 7807)
```python
# schemas/error.py
from pydantic import BaseModel
class ErrorDetail(BaseModel):
    """Error body modelled on the RFC 7807 problem-details members."""

    # "about:blank" is the default used when no specific problem type is given.
    type: str = "about:blank"
    title: str
    status: int
    detail: str
    instance: str | None = None
# Exception handler
from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse


async def validation_exception_handler(
    request: Request, exc: RequestValidationError
) -> JSONResponse:
    """Turn a request-validation failure into an RFC 7807 problem response.

    Register it with
    ``app.add_exception_handler(RequestValidationError, validation_exception_handler)``.

    Args:
        request: The request that failed validation; its URL becomes ``instance``.
        exc: The validation error; ``exc.errors()`` is stringified into ``detail``.

    Returns:
        A 422 response whose body is a serialized ``ErrorDetail``.
    """
    problem = ErrorDetail(
        type="validation_error",
        title="Validation Error",
        status=422,
        detail=str(exc.errors()),
        instance=str(request.url),
    )
    return JSONResponse(
        status_code=problem.status,
        content=problem.model_dump(),
        # RFC 7807 registers this media type for problem-detail bodies.
        media_type="application/problem+json",
    )
```
### TypeScript Error Handling
```typescript
// lib/api-client.ts
import axios, { AxiosError } from 'axios';
import { z } from 'zod';
// Shape of the error body the client expects: type, title, status, detail, optional instance.
const errorSchema = z.object({
  type: z.string(),
  title: z.string(),
  status: z.number(),
  detail: z.string(),
  instance: z.optional(z.string()),
});
/** Error thrown by the API client; `message` mirrors `detail`. */
export class ApiError extends Error {
  public status: number;
  public title: string;
  public detail: string;

  constructor(status: number, title: string, detail: string) {
    super(detail);
    this.status = status;
    this.title = title;
    this.detail = detail;
    this.name = 'ApiError';
  }
}
// Shared axios instance: requests are relative to /api and sent as JSON.
const JSON_HEADERS = { 'Content-Type': 'application/json' };

export const apiClient = axios.create({
  baseURL: '/api',
  headers: JSON_HEADERS,
});
// Normalise every failed request into an ApiError.
apiClient.interceptors.response.use(
  (response) => response,
  (error: AxiosError) => {
    const response = error.response;

    // Preferred path: the body matches the expected error schema.
    const parsed = errorSchema.safeParse(response?.data);
    if (parsed.success) {
      throw new ApiError(parsed.data.status, parsed.data.title, parsed.data.detail);
    }

    // The server answered with a body we cannot parse: keep its real status
    // instead of reporting every such failure as a 500.
    if (response) {
      throw new ApiError(
        response.status,
        response.statusText || 'Server Error',
        'An unexpected error occurred',
      );
    }

    // No response at all (network failure, timeout, cancelled request).
    throw new ApiError(500, 'Server Error', 'An unexpected error occurred');
  },
);
```
## Pagination Pattern
```python
# schemas/pagination.py
from typing import Generic, TypeVar

from pydantic import BaseModel, Field, computed_field

T = TypeVar("T")


class PaginatedResponse(BaseModel, Generic[T]):
    """Generic paginated response.

    ``has_more`` and ``total_pages`` are derived from the stored counters and
    declared as computed fields so they are included when the model is
    serialized; a plain ``@property`` would be left out of the output.
    """

    items: list[T]
    total: int
    page: int = Field(ge=1)
    page_size: int = Field(ge=1, le=100)

    @computed_field
    @property
    def has_more(self) -> bool:
        """Whether items exist beyond the current page."""
        return self.page * self.page_size < self.total

    @computed_field
    @property
    def total_pages(self) -> int:
        """Number of pages needed for ``total`` items (ceiling division)."""
        return (self.total + self.page_size - 1) // self.page_size


# Usage (in the module that defines UserResponse, e.g. schemas/user.py)
class UserListResponse(PaginatedResponse[UserResponse]):
    pass
```
## Query Parameters
```python
# dependencies/pagination.py
from datetime import datetime

from fastapi import Query
from pydantic import BaseModel

# NOTE(review): these models use Query() as field defaults. Confirm that the
# FastAPI version in use resolves them as query parameters when the model is
# injected; newer FastAPI releases document query-parameter models declared
# with pydantic Field() instead.


class PaginationParams(BaseModel):
    """Page number and page size, plus the derived row offset."""

    page: int = Query(1, ge=1, description="Page number")
    page_size: int = Query(20, ge=1, le=100, description="Items per page")

    @property
    def offset(self) -> int:
        """Zero-based index of the first item on the requested page."""
        return (self.page - 1) * self.page_size


class SortParams(BaseModel):
    """Sort field and direction; direction must be ``asc`` or ``desc``."""

    sort_by: str = Query("created_at", description="Field to sort by")
    sort_order: str = Query("desc", pattern="^(asc|desc)$")


class FilterParams(BaseModel):
    """Optional filters; every field defaults to ``None``."""

    search: str | None = Query(None, min_length=1, max_length=100)
    status: str | None = Query(None, pattern="^(active|inactive|pending)$")
    created_after: datetime | None = Query(None)
    created_before: datetime | None = Query(None)
```
## OpenAPI Documentation
```python
# main.py
from fastapi import FastAPI
from fastapi.openapi.utils import get_openapi
app = FastAPI(
title="My API",
description="API for managing resources",
version="1.0.0",
docs_url="/docs",
redoc_url="/redoc",
)
def custom_openapi():
    """Build the OpenAPI schema once, add a JWT bearer scheme, and cache it.

    Returns:
        The schema dict stored on ``app.openapi_schema``; later calls return
        the cached value without regenerating it.
    """
    if app.openapi_schema:
        return app.openapi_schema

    openapi_schema = get_openapi(
        title=app.title,
        version=app.version,
        description=app.description,
        routes=app.routes,
    )

    # Add security scheme. Create "components" on demand: indexing it directly
    # raises KeyError if the generated schema has no such section.
    components = openapi_schema.setdefault("components", {})
    components["securitySchemes"] = {
        "bearerAuth": {
            "type": "http",
            "scheme": "bearer",
            "bearerFormat": "JWT",
        }
    }

    app.openapi_schema = openapi_schema
    return app.openapi_schema
app.openapi = custom_openapi
```
## HTTP Status Codes
| Code | Meaning | When to Use |
|------|---------|-------------|
| 200 | OK | Successful GET, PUT, PATCH |
| 201 | Created | Successful POST creating resource |
| 204 | No Content | Successful DELETE |
| 400 | Bad Request | Malformed request, or input rejected for a reason other than schema validation |
| 401 | Unauthorized | Missing/invalid authentication |
| 403 | Forbidden | Authenticated but not authorized |
| 404 | Not Found | Resource doesn't exist |
| 409 | Conflict | Duplicate resource (e.g., email exists) |
| 422 | Unprocessable Entity | Well-formed request that fails schema validation |
| 500 | Server Error | Unexpected server error |
## Anti-Patterns
```python
# BAD: Returning different shapes
@router.get("/users/{user_id}")
async def get_user(user_id: str):
    # The lookup goes through the service; calling get_user() here would make
    # the handler await itself.
    user = await service.get_user(user_id)
    if user:
        return user  # UserResponse
    return {"error": "not found"}  # Different shape!
# GOOD: Consistent response or exception
@router.get("/users/{user_id}", response_model=UserResponse)
async def get_user(user_id: str):
    # The lookup goes through the service; calling get_user() here would make
    # the handler await itself.
    user = await service.get_user(user_id)
    if not user:
        raise HTTPException(status_code=404, detail="User not found")
    return user
# BAD: Exposing internal details
class UserResponse(BaseModel):
    """Anti-pattern: a response model that carries internal fields."""

    id: str
    email: str
    hashed_password: str  # NEVER expose!
    internal_notes: str  # Internal only!
# GOOD: Explicit public fields
class UserResponse(BaseModel):
    """Response model listing only the fields clients need."""

    id: str
    email: str
    name: str
# BAD: No validation at boundary
@router.post("/users")
async def create_user(data: dict):  # Unvalidated!
    created = await service.create(data)
    return created
# GOOD: Schema validation
@router.post("/users", response_model=UserResponse)
async def create_user(data: UserCreate):  # Validated!
    created = await service.create(data)
    return created
```

View File

@@ -0,0 +1,404 @@
---
name: monorepo-patterns
description: Monorepo workspace patterns for multi-package projects with shared dependencies, testing strategies, and CI/CD. Use when working in monorepo structures.
---
# Monorepo Patterns Skill
## Recommended Structure
```
project/
├── apps/
│ ├── backend/ # Python FastAPI
│ │ ├── src/
│ │ ├── tests/
│ │ └── pyproject.toml
│ └── frontend/ # React TypeScript
│ ├── src/
│ ├── tests/
│ └── package.json
├── packages/
│ ├── shared-types/ # Shared TypeScript types
│ │ ├── src/
│ │ └── package.json
│ └── ui-components/ # Shared React components
│ ├── src/
│ └── package.json
├── infrastructure/
│ ├── terraform/
│ │ ├── environments/
│ │ └── modules/
│ └── ansible/
│ ├── playbooks/
│ └── roles/
├── scripts/ # Shared scripts
├── docs/ # Documentation
├── .github/
│ └── workflows/
├── package.json # Root (workspaces config)
├── pyproject.toml # Python workspace config
└── CLAUDE.md # Project-level guidance
```
## Workspace Configuration
### npm Workspaces (Node.js)
```json
// package.json (root)
{
"name": "my-monorepo",
"private": true,
"workspaces": [
"apps/*",
"packages/*"
],
"scripts": {
"dev": "npm run dev --workspaces --if-present",
"build": "npm run build --workspaces --if-present",
"test": "npm run test --workspaces --if-present",
"lint": "npm run lint --workspaces --if-present",
"typecheck": "npm run typecheck --workspaces --if-present"
},
"devDependencies": {
"typescript": "^5.6.0",
"vitest": "^3.2.0",
"@types/node": "^22.0.0"
}
}
```
### uv Workspace (Python)
```toml
# pyproject.toml (root)
[project]
name = "my-monorepo"
version = "0.0.0"
requires-python = ">=3.11"
[tool.uv.workspace]
members = ["apps/*", "packages/*"]
[tool.uv.sources]
shared-utils = { workspace = true }
```
## Package References
### TypeScript Internal Packages
```json
// packages/shared-types/package.json
{
"name": "@myorg/shared-types",
"version": "0.0.0",
"private": true,
"main": "./dist/index.js",
"types": "./dist/index.d.ts",
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js"
}
},
"scripts": {
"build": "tsc",
"dev": "tsc --watch"
}
}
// apps/frontend/package.json
// npm links "*" to the local workspace package; the "workspace:" protocol is
// only understood by pnpm and Yarn and makes `npm install` fail.
{
  "name": "@myorg/frontend",
  "dependencies": {
    "@myorg/shared-types": "*"
  }
}
```
### Python Internal Packages
```toml
# packages/shared-utils/pyproject.toml
[project]
name = "shared-utils"
version = "0.1.0"
dependencies = []
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
# apps/backend/pyproject.toml
[project]
name = "backend"
dependencies = [
"shared-utils", # Resolved via workspace
]
```
## Testing Strategies
### Run All Tests
```bash
# From root
npm test # All Node packages
uv run pytest # All Python packages
# Specific workspace
npm test --workspace=@myorg/frontend
uv run pytest apps/backend/
```
### Test Dependencies Between Packages
```typescript
// packages/shared-types/src/user.ts
/** User shape shared across packages through @myorg/shared-types. */
export type User = {
  id: string;
  email: string;
  name: string;
};
// apps/frontend/src/features/users/types.ts
// The type comes from the workspace package, not from a relative path.
import type { User } from '@myorg/shared-types';

type SelectUserHandler = (user: User) => void;

export type UserListProps = {
  users: Array<User>;
  onSelect: SelectUserHandler;
};
```
### Integration Tests Across Packages
```typescript
// apps/frontend/tests/integration/api.test.tsx (JSX requires the .tsx extension)
import type { User } from '@myorg/shared-types';
import { screen } from '@testing-library/react';
import { http, HttpResponse } from 'msw';
import { describe, expect, it } from 'vitest';
// NOTE(review): the two paths below are assumptions; point them at the
// project's MSW server setup and at the component under test.
import { server } from '../mocks/server';
import { UserProfile } from '../../src/features/users/UserProfile';
import { renderWithProviders } from '../utils/render';

describe('Frontend-Backend Integration', () => {
  it('should display user from API', async () => {
    const mockUser: User = {
      id: 'user-1',
      email: 'test@example.com',
      name: 'Test User',
    };

    // Mock API response with shared type
    server.use(
      http.get('/api/users/user-1', () => HttpResponse.json(mockUser)),
    );

    // Use the project's render helper that this file imports.
    renderWithProviders(<UserProfile userId="user-1" />);

    expect(await screen.findByText('Test User')).toBeInTheDocument();
  });
});
```
## CI/CD Patterns
### Change Detection
```yaml
# .github/workflows/ci.yml
name: CI
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
detect-changes:
runs-on: ubuntu-latest
outputs:
frontend: ${{ steps.changes.outputs.frontend }}
backend: ${{ steps.changes.outputs.backend }}
infrastructure: ${{ steps.changes.outputs.infrastructure }}
steps:
- uses: actions/checkout@v4
- uses: dorny/paths-filter@v3
id: changes
with:
filters: |
frontend:
- 'apps/frontend/**'
- 'packages/shared-types/**'
- 'packages/ui-components/**'
backend:
- 'apps/backend/**'
- 'packages/shared-utils/**'
infrastructure:
- 'infrastructure/**'
frontend:
needs: detect-changes
if: needs.detect-changes.outputs.frontend == 'true'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: '22'
cache: 'npm'
- run: npm ci
- run: npm run typecheck --workspace=@myorg/frontend
- run: npm run lint --workspace=@myorg/frontend
- run: npm run test --workspace=@myorg/frontend
backend:
needs: detect-changes
if: needs.detect-changes.outputs.backend == 'true'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v4
- run: uv sync
- run: uv run ruff check apps/backend/
- run: uv run mypy apps/backend/
- run: uv run pytest apps/backend/ --cov --cov-fail-under=80
```
### Jenkinsfile for Monorepo
```groovy
// Jenkinsfile
pipeline {
    agent any

    stages {
        stage('Detect Changes') {
            steps {
                script {
                    // Paths that differ from the previous commit (HEAD~1).
                    def changes = sh(
                        script: 'git diff --name-only HEAD~1',
                        returnStdout: true
                    ).trim().split('\n')

                    // True when any changed path starts with one of the prefixes.
                    def touchesAny = { List prefixes ->
                        changes.any { path -> prefixes.any { prefix -> path.startsWith(prefix) } }
                    }

                    // Same path filters as .github/workflows/ci.yml so both CI
                    // systems build the same targets for the same change.
                    // env values are strings, hence toString() here and the
                    // == 'true' comparisons in the stages below.
                    env.FRONTEND_CHANGED = touchesAny([
                        'apps/frontend/',
                        'packages/shared-types/',
                        'packages/ui-components/',
                    ]).toString()
                    env.BACKEND_CHANGED = touchesAny([
                        'apps/backend/',
                        'packages/shared-utils/',
                    ]).toString()
                    env.INFRA_CHANGED = touchesAny(['infrastructure/']).toString()
                }
            }
        }

        stage('Frontend') {
            when {
                expression { env.FRONTEND_CHANGED == 'true' }
            }
            steps {
                dir('apps/frontend') {
                    sh 'npm ci'
                    sh 'npm run typecheck'
                    sh 'npm run lint'
                    sh 'npm run test'
                }
            }
        }

        stage('Backend') {
            when {
                expression { env.BACKEND_CHANGED == 'true' }
            }
            steps {
                sh 'uv sync'
                sh 'uv run ruff check apps/backend/'
                // Type-check as well, matching the GitHub Actions backend job.
                sh 'uv run mypy apps/backend/'
                sh 'uv run pytest apps/backend/ --cov --cov-fail-under=80'
            }
        }

        stage('Infrastructure') {
            when {
                expression { env.INFRA_CHANGED == 'true' }
            }
            steps {
                dir('infrastructure/terraform') {
                    sh 'terraform init'
                    sh 'terraform validate'
                    sh 'terraform fmt -check -recursive'
                }
            }
        }
    }
}
```
## Dependency Management
### Shared Dependencies at Root
```json
// package.json (root)
{
"devDependencies": {
// Shared dev dependencies
"typescript": "^5.6.0",
"vitest": "^3.2.0",
"eslint": "^9.0.0",
"@types/node": "^22.0.0"
}
}
```
### Package-Specific Dependencies
```json
// apps/frontend/package.json
{
"dependencies": {
// App-specific dependencies
"react": "^18.3.0",
"@tanstack/react-query": "^5.0.0"
}
}
```
## Commands Quick Reference
```bash
# Install all dependencies
npm install # Node (from root)
uv sync # Python
# Run in specific workspace
npm run dev --workspace=@myorg/frontend
npm run test --workspace=@myorg/shared-types
# Run in all workspaces
npm run build --workspaces
npm run test --workspaces --if-present
# Add dependency to specific package
npm install lodash --workspace=@myorg/frontend
uv add requests --package backend
# Add shared dependency to root
npm install -D prettier
```
## CLAUDE.md Placement
### Root CLAUDE.md (Project-Wide)
```markdown
# Project Standards
[Core standards that apply everywhere]
```
### Package-Specific CLAUDE.md
```markdown
# apps/frontend/CLAUDE.md
## Frontend-Specific Standards
- Use React Testing Library for component tests
- Prefer Radix UI primitives
- Use TanStack Query for server state
```
```markdown
# apps/backend/CLAUDE.md
## Backend-Specific Standards
- Use pytest-asyncio for async tests
- Pydantic v2 for all schemas
- SQLAlchemy 2.0 async patterns
```
Skills in `~/.claude/skills/` are automatically available across all packages in the monorepo.

View File

@@ -0,0 +1,486 @@
---
name: observability
description: Logging, metrics, and tracing patterns for application observability. Use when implementing monitoring, debugging, or production visibility.
---
# Observability Skill
## Three Pillars
1. **Logs** - Discrete events with context
2. **Metrics** - Aggregated measurements over time
3. **Traces** - Request flow across services
## Structured Logging
### Python (structlog)
```python
import logging

import structlog
from structlog.types import Processor


def configure_logging(json_output: bool = True) -> None:
    """Configure structured logging.

    Args:
        json_output: Render events as JSON when true, otherwise with
            structlog's console renderer.
    """
    # The renderer must be last: it turns the event dict into the final output.
    renderer = (
        structlog.processors.JSONRenderer()
        if json_output
        else structlog.dev.ConsoleRenderer()
    )
    processors: list[Processor] = [
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        renderer,
    ]

    structlog.configure(
        processors=processors,
        # Drop events below INFO.
        wrapper_class=structlog.make_filtering_bound_logger(logging.INFO),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
# Usage
logger = structlog.get_logger()

# Context bound here is merged into every later log call
structlog.contextvars.bind_contextvars(request_id="req-123", user_id="user-456")

logger.info("order_created", order_id="order-789", total=150.00)
# {"event": "order_created", "order_id": "order-789", "total": 150.0, "request_id": "req-123", "user_id": "user-456", "level": "info", "timestamp": "2024-01-15T10:30:00Z"}

logger.error("payment_failed", order_id="order-789", error="insufficient_funds")
```
### TypeScript (pino)
```typescript
import pino from 'pino';
// Keys whose values pino replaces before a line is written.
const REDACTED_KEYS = ['password', 'token', 'authorization'];

const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    // Emit the level as its label (e.g. "info").
    level(label) {
      return { level: label };
    },
  },
  timestamp: pino.stdTimeFunctions.isoTime,
  redact: REDACTED_KEYS,
});
// A child logger carries its bindings on every line it writes
const requestContext = { requestId: 'req-123', userId: 'user-456' };
const requestLogger = logger.child(requestContext);

const orderId = 'order-789';
requestLogger.info({ orderId, total: 150.0 }, 'order_created');
requestLogger.error({ orderId, error: 'insufficient_funds' }, 'payment_failed');
// Express middleware
import { randomUUID } from 'crypto';

const loggingMiddleware = (req, res, next) => {
  // Reuse the caller's request ID when present so logs can be correlated.
  const requestId = req.headers['x-request-id'] || randomUUID();
  const { method, path } = req;

  req.log = logger.child({
    requestId,
    method,
    path,
    userAgent: req.headers['user-agent'],
  });

  const startTime = Date.now();

  // Log once the response has been sent, including how long it took.
  const logCompletion = () => {
    req.log.info(
      { statusCode: res.statusCode, durationMs: Date.now() - startTime },
      'request_completed',
    );
  };
  res.on('finish', logCompletion);

  next();
};
```
### Log Levels
| Level | When to Use |
|-------|-------------|
| `error` | Failures requiring attention |
| `warn` | Unexpected but handled situations |
| `info` | Business events (order created, user logged in) |
| `debug` | Technical details for debugging |
| `trace` | Very detailed tracing (rarely used in prod) |
## Metrics
### Python (prometheus-client)
```python
from prometheus_client import Counter, Histogram, Gauge, start_http_server
import time
# Define metrics
REQUEST_COUNT = Counter(
    name='http_requests_total',
    documentation='Total HTTP requests',
    labelnames=['method', 'endpoint', 'status'],
)

REQUEST_LATENCY = Histogram(
    name='http_request_duration_seconds',
    documentation='HTTP request latency',
    labelnames=['method', 'endpoint'],
    buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 5.0],
)

ACTIVE_CONNECTIONS = Gauge(
    name='active_connections',
    documentation='Number of active connections',
)

ORDERS_PROCESSED = Counter(
    name='orders_processed_total',
    documentation='Total orders processed',
    labelnames=['status'],  # success, failed
)
# Usage
def process_request(method: str, endpoint: str):
    """Record count, latency and in-flight gauge around one request.

    The gauge is incremented on entry and decremented on exit; latency is
    observed whether or not the work raises.
    """
    ACTIVE_CONNECTIONS.inc()
    started = time.time()
    request_labels = {"method": method, "endpoint": endpoint}

    try:
        # Process request...
        REQUEST_COUNT.labels(status='200', **request_labels).inc()
    except Exception:
        REQUEST_COUNT.labels(status='500', **request_labels).inc()
        raise
    finally:
        elapsed = time.time() - started
        REQUEST_LATENCY.labels(**request_labels).observe(elapsed)
        ACTIVE_CONNECTIONS.dec()
# FastAPI middleware
from fastapi import FastAPI, Request
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
from starlette.responses import Response

app = FastAPI()


@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    """Count every request and observe its latency.

    The ``endpoint`` label is the route template rather than the raw URL
    path, so IDs embedded in URLs do not create a new time series each.
    Requests whose handler raises are recorded with status 500 before the
    exception is re-raised.
    """
    # perf_counter is monotonic, so durations are immune to clock adjustments.
    start_time = time.perf_counter()

    def record(status_code: int) -> None:
        # NOTE(review): relies on the router storing the matched route under
        # scope["route"] - confirm for the FastAPI version in use. Requests
        # without a matched route share one label value.
        route = request.scope.get("route")
        endpoint = getattr(route, "path", "unmatched")
        REQUEST_COUNT.labels(
            method=request.method,
            endpoint=endpoint,
            status=status_code,
        ).inc()
        REQUEST_LATENCY.labels(
            method=request.method,
            endpoint=endpoint,
        ).observe(time.perf_counter() - start_time)

    try:
        response = await call_next(request)
    except Exception:
        record(500)
        raise

    record(response.status_code)
    return response
@app.get("/metrics")
async def metrics():
    """Serve the output of ``generate_latest()`` for Prometheus to scrape."""
    payload = generate_latest()
    return Response(content=payload, media_type=CONTENT_TYPE_LATEST)
```
### TypeScript (prom-client)
```typescript
import { Registry, Counter, Histogram, Gauge, collectDefaultMetrics } from 'prom-client';
// Dedicated registry holding both the default process metrics and the HTTP metrics below.
const register = new Registry();
collectDefaultMetrics({ register });

const registers = [register];

const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'path', 'status'],
  registers,
});

const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration',
  labelNames: ['method', 'path'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 5],
  registers,
});
// Express middleware
const metricsMiddleware = (req, res, next) => {
  // Start timing now; labels are supplied when the timer is stopped.
  const end = httpRequestDuration.startTimer();

  res.on('finish', () => {
    // Label with the route pattern (e.g. /users/:id), not the concrete URL,
    // to keep label cardinality bounded. req.route is only set once Express
    // has matched a handler, so unmatched requests share one label value.
    const path = req.route ? `${req.baseUrl}${req.route.path}` : 'unmatched';
    const labels = { method: req.method, path };

    httpRequestsTotal.inc({ ...labels, status: res.statusCode });
    end(labels);
  });

  next();
};
// Metrics endpoint
app.get('/metrics', async (_req, res) => {
  res.set('Content-Type', register.contentType);
  const body = await register.metrics();
  res.end(body);
});
```
### Key Metrics (RED Method)
| Metric | Description |
|--------|-------------|
| **R**ate | Requests per second |
| **E**rrors | Error rate (%) |
| **D**uration | Latency (p50, p95, p99) |
### Key Metrics (USE Method for Resources)
| Metric | Description |
|--------|-------------|
| **U**tilization | % time resource is busy |
| **S**aturation | Queue depth, backlog |
| **E**rrors | Error count |
## Distributed Tracing
### Python (OpenTelemetry)
```python
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
def configure_tracing(service_name: str, otlp_endpoint: str) -> None:
    """Configure OpenTelemetry tracing.

    Args:
        service_name: Value stored in the ``service.name`` resource attribute.
        otlp_endpoint: Endpoint passed to the OTLP span exporter.
    """
    resource = Resource.create({"service.name": service_name})
    provider = TracerProvider(resource=resource)

    # Spans are batched before being handed to the exporter.
    processor = BatchSpanProcessor(OTLPSpanExporter(endpoint=otlp_endpoint))
    provider.add_span_processor(processor)

    trace.set_tracer_provider(provider)

    # Auto-instrument libraries. instrument() is an instance method, so each
    # instrumentor has to be instantiated first.
    FastAPIInstrumentor().instrument()
    SQLAlchemyInstrumentor().instrument()
    HTTPXClientInstrumentor().instrument()
# Manual instrumentation
tracer = trace.get_tracer(__name__)


async def process_order(order_id: str) -> dict:
    """Validate and charge an order inside a ``process_order`` span.

    Validation and payment each run in their own child span. Returns
    whatever ``charge_payment`` returns.
    """
    with tracer.start_as_current_span("process_order") as order_span:
        order_span.set_attribute("order.id", order_id)

        # Child span for validation
        with tracer.start_as_current_span("validate_order"):
            await validate_order(order_id)

        # Child span for payment
        with tracer.start_as_current_span("process_payment") as payment_span:
            payment_span.set_attribute("payment.method", "card")
            payment_result = await charge_payment(order_id)

        order_span.set_attribute("order.status", "completed")
        return payment_result
```
### TypeScript (OpenTelemetry)
```typescript
import { NodeSDK } from '@opentelemetry/sdk-node';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-grpc';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
// Resource attributes identify this service on every exported span.
const resource = new Resource({
  [SemanticResourceAttributes.SERVICE_NAME]: 'my-service',
});

const traceExporter = new OTLPTraceExporter({
  url: process.env.OTLP_ENDPOINT,
});

const sdk = new NodeSDK({
  resource,
  traceExporter,
  instrumentations: [getNodeAutoInstrumentations()],
});

sdk.start();
// Manual instrumentation
import { trace, SpanStatusCode } from '@opentelemetry/api';

const tracer = trace.getTracer('my-service');

/**
 * Validate and charge an order inside a `process_order` span.
 * Every span is ended in a `finally` block so it is closed even when the
 * wrapped call throws. Resolves with the result of `chargePayment`.
 */
async function processOrder(orderId: string) {
  return tracer.startActiveSpan('process_order', async (span) => {
    try {
      span.setAttribute('order.id', orderId);

      await tracer.startActiveSpan('validate_order', async (validateSpan) => {
        try {
          await validateOrder(orderId);
        } finally {
          validateSpan.end();
        }
      });

      const result = await tracer.startActiveSpan('process_payment', async (paymentSpan) => {
        try {
          paymentSpan.setAttribute('payment.method', 'card');
          return await chargePayment(orderId);
        } finally {
          paymentSpan.end();
        }
      });

      span.setStatus({ code: SpanStatusCode.OK });
      return result;
    } catch (error) {
      // A caught value is `unknown`; narrow it before reading `.message`.
      const message = error instanceof Error ? error.message : String(error);
      span.setStatus({ code: SpanStatusCode.ERROR, message });
      span.recordException(error instanceof Error ? error : message);
      throw error;
    } finally {
      span.end();
    }
  });
}
```
## Health Checks
```python
from fastapi import FastAPI, Response
from pydantic import BaseModel
from enum import Enum
class HealthStatus(str, Enum):
    """Health states; members are also ``str`` and compare equal to their values."""

    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
class ComponentHealth(BaseModel):
    """Health of one component, with an optional message."""

    name: str
    status: HealthStatus
    message: str | None = None
class HealthResponse(BaseModel):
    """Body of ``/health``: overall status, version, and per-component results."""

    status: HealthStatus
    version: str
    components: list[ComponentHealth]
async def check_database() -> ComponentHealth:
    """Run ``SELECT 1`` and report the database as healthy or unhealthy.

    An ``Exception`` from the query is caught and reported in ``message``
    rather than propagated.
    """
    try:
        await db.execute("SELECT 1")
    except Exception as exc:
        return ComponentHealth(
            name="database",
            status=HealthStatus.UNHEALTHY,
            message=str(exc),
        )
    else:
        return ComponentHealth(name="database", status=HealthStatus.HEALTHY)
async def check_redis() -> ComponentHealth:
    """Ping Redis and report it as healthy, or degraded when the ping fails.

    An ``Exception`` from the ping is caught and reported in ``message``
    rather than propagated.
    """
    try:
        await redis.ping()
    except Exception as exc:
        return ComponentHealth(
            name="redis",
            status=HealthStatus.DEGRADED,
            message=str(exc),
        )
    else:
        return ComponentHealth(name="redis", status=HealthStatus.HEALTHY)
@app.get("/health", response_model=HealthResponse)
async def health_check(response: Response):
    """Run all component checks concurrently and report the worst status.

    The HTTP status is set to 503 when any component is unhealthy; degraded
    and healthy results keep the default status code.
    """
    # Imported here because the snippet's top-level imports do not include it.
    import asyncio

    components = await asyncio.gather(
        check_database(),
        check_redis(),
    )

    # Overall status is worst component status
    statuses = {component.status for component in components}
    if HealthStatus.UNHEALTHY in statuses:
        overall = HealthStatus.UNHEALTHY
        response.status_code = 503
    elif HealthStatus.DEGRADED in statuses:
        overall = HealthStatus.DEGRADED
    else:
        overall = HealthStatus.HEALTHY

    return HealthResponse(
        status=overall,
        version="1.0.0",
        components=components,
    )
@app.get("/ready")
async def readiness_check():
    """Kubernetes readiness probe - can we serve traffic?

    ``check_database`` reports failures in its return value instead of
    raising, so the result has to be inspected for the probe to ever fail.

    Raises:
        HTTPException: 503 when the database check is not healthy.
    """
    # Imported here because the snippet's top-level imports do not include it.
    from fastapi import HTTPException

    # Check critical dependencies
    database = await check_database()
    if database.status != HealthStatus.HEALTHY:
        raise HTTPException(
            status_code=503,
            detail=database.message or "database unavailable",
        )
    return {"status": "ready"}
@app.get("/live")
async def liveness_check():
    """Kubernetes liveness probe - is the process healthy?

    Returns a constant body without checking any dependency.
    """
    alive = {"status": "alive"}
    return alive
```
## Alerting Rules
```yaml
# prometheus-rules.yaml
groups:
- name: application
rules:
# High error rate
- alert: HighErrorRate
expr: |
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m])) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | humanizePercentage }}"
# High latency
- alert: HighLatency
expr: |
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
for: 5m
labels:
severity: warning
annotations:
summary: "High latency detected"
description: "p95 latency is {{ $value }}s"
# Service down
- alert: ServiceDown
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Service is down"
```
## Best Practices
### Logging
- Use structured JSON logs
- Include correlation/request IDs
- Redact sensitive data
- Use appropriate log levels
- Avoid logging on every iteration of a hot path; sample or aggregate instead
### Metrics
- Use consistent naming conventions
- Keep cardinality under control
- Use histograms for latency (not averages)
- Export business metrics alongside technical ones
### Tracing
- Instrument at service boundaries
- Propagate context across services
- Sample appropriately in production
- Add relevant attributes to spans