chore: initialize repository with deployment baseline
This commit is contained in:
391
backend/src/routes/monitoring.routes.ts
Normal file
391
backend/src/routes/monitoring.routes.ts
Normal file
@@ -0,0 +1,391 @@
|
||||
import {
|
||||
AlertChannel,
|
||||
HealthCheckTargetType,
|
||||
HealthCheckType,
|
||||
MonitoringAlertStatus,
|
||||
Severity
|
||||
} from "@prisma/client";
|
||||
import { Router } from "express";
|
||||
import { z } from "zod";
|
||||
import { HttpError } from "../lib/http-error";
|
||||
import { prisma } from "../lib/prisma";
|
||||
import { toPrismaJsonValue } from "../lib/prisma-json";
|
||||
import { authorize, isTenantScopedUser, requireAuth } from "../middleware/auth";
|
||||
import { logAudit } from "../services/audit.service";
|
||||
import {
|
||||
clusterResourceForecast,
|
||||
createAlertRule,
|
||||
createHealthCheckDefinition,
|
||||
evaluateAlertRulesNow,
|
||||
faultyDeploymentInsights,
|
||||
listAlertEvents,
|
||||
listAlertNotifications,
|
||||
listAlertRules,
|
||||
listHealthCheckResults,
|
||||
listHealthChecks,
|
||||
monitoringOverview,
|
||||
runHealthCheckNow,
|
||||
updateAlertRule,
|
||||
updateHealthCheckDefinition
|
||||
} from "../services/monitoring.service";
|
||||
|
||||
// Express router that every monitoring endpoint in this file is registered on.
const router = Router();
|
||||
|
||||
/**
 * Request-body schema for a health check definition.
 *
 * Used as-is by `POST /health-checks` and through `.partial()` by
 * `PATCH /health-checks/:id`, so it must remain a plain `z.object`.
 *
 * Only `name` (at least 2 characters) and `target_type` are required.
 * `*_pct` thresholds are limited to 0–100, the I/O and network thresholds
 * must be non-negative, latency thresholds are integers >= 1, and
 * `schedule_minutes` is an integer between 1 and 1440.
 *
 * NOTE(review): nothing here enforces that a `*_warn` value is below its
 * `*_critical` counterpart — confirm whether the service layer checks that.
 */
const healthCheckSchema = z.object({
  name: z.string().min(2),
  description: z.string().optional(),
  target_type: z.nativeEnum(HealthCheckTargetType),
  check_type: z.nativeEnum(HealthCheckType).optional(),

  // Optional scoping of the check to a tenant, VM or node.
  tenant_id: z.string().optional(),
  vm_id: z.string().optional(),
  node_id: z.string().optional(),

  // Percentage thresholds (0–100).
  cpu_warn_pct: z.number().min(0).max(100).optional(),
  cpu_critical_pct: z.number().min(0).max(100).optional(),
  ram_warn_pct: z.number().min(0).max(100).optional(),
  ram_critical_pct: z.number().min(0).max(100).optional(),
  disk_warn_pct: z.number().min(0).max(100).optional(),
  disk_critical_pct: z.number().min(0).max(100).optional(),

  // Non-negative throughput thresholds (units are not defined in this file).
  disk_io_read_warn: z.number().min(0).optional(),
  disk_io_read_critical: z.number().min(0).optional(),
  disk_io_write_warn: z.number().min(0).optional(),
  disk_io_write_critical: z.number().min(0).optional(),
  network_in_warn: z.number().min(0).optional(),
  network_in_critical: z.number().min(0).optional(),
  network_out_warn: z.number().min(0).optional(),
  network_out_critical: z.number().min(0).optional(),

  // Latency thresholds in whole milliseconds.
  latency_warn_ms: z.number().int().min(1).optional(),
  latency_critical_ms: z.number().int().min(1).optional(),

  // 1 minute up to 1440 minutes (24 hours).
  schedule_minutes: z.number().int().min(1).max(1440).optional(),
  enabled: z.boolean().optional(),
  metadata: z.record(z.unknown()).optional()
});
|
||||
|
||||
/**
 * Request-body schema for an alert rule.
 *
 * Used as-is by `POST /alerts/rules` and through `.partial()` by
 * `PATCH /alerts/rules/:id`, so it must remain a plain `z.object`.
 *
 * Only `name` (at least 2 characters) is required. `*_threshold_pct` values
 * are limited to 0–100, the other thresholds must be non-negative,
 * `consecutive_breaches` is an integer from 1 to 20 and
 * `evaluation_window_minutes` an integer from 1 to 1440.
 */
const alertRuleSchema = z.object({
  name: z.string().min(2),
  description: z.string().optional(),

  // Optional scoping of the rule to a tenant, VM or node.
  tenant_id: z.string().optional(),
  vm_id: z.string().optional(),
  node_id: z.string().optional(),

  // Percentage thresholds (0–100).
  cpu_threshold_pct: z.number().min(0).max(100).optional(),
  ram_threshold_pct: z.number().min(0).max(100).optional(),
  disk_threshold_pct: z.number().min(0).max(100).optional(),

  // Non-negative throughput thresholds (units are not defined in this file).
  disk_io_read_threshold: z.number().min(0).optional(),
  disk_io_write_threshold: z.number().min(0).optional(),
  network_in_threshold: z.number().min(0).optional(),
  network_out_threshold: z.number().min(0).optional(),

  consecutive_breaches: z.number().int().min(1).max(20).optional(),
  // 1 minute up to 1440 minutes (24 hours).
  evaluation_window_minutes: z.number().int().min(1).max(1440).optional(),
  severity: z.nativeEnum(Severity).optional(),
  channels: z.array(z.nativeEnum(AlertChannel)).optional(),
  enabled: z.boolean().optional(),
  metadata: z.record(z.unknown()).optional()
});
|
||||
|
||||
/**
 * Loads a VM by id and rejects the request when a tenant-scoped caller
 * references a VM that belongs to a different tenant.
 *
 * @param vmId - Id of the virtual machine to look up.
 * @param req - Request object; only `user` is read.
 * @returns The VM's `id`, `tenant_id` and `name`.
 * @throws HttpError 404 (`VM_NOT_FOUND`) when no VM has that id.
 * @throws HttpError 403 (`TENANT_SCOPE_VIOLATION`) when the caller is
 *   tenant-scoped and the VM's tenant differs from the caller's tenant.
 */
async function ensureVmTenantScope(vmId: string, req: Pick<Express.Request, "user">) {
  const vm = await prisma.virtualMachine.findUnique({
    where: { id: vmId },
    select: {
      id: true,
      tenant_id: true,
      name: true
    }
  });

  if (!vm) {
    throw new HttpError(404, "VM not found", "VM_NOT_FOUND");
  }

  // NOTE(review): the comparison is skipped when a tenant-scoped user has no
  // `tenant_id`, so such a user passes this check for any VM. Confirm that
  // `isTenantScopedUser` guarantees a tenant id, or make this fail closed.
  if (isTenantScopedUser(req) && req.user?.tenant_id && vm.tenant_id !== req.user.tenant_id) {
    throw new HttpError(403, "Access denied for tenant scope", "TENANT_SCOPE_VIOLATION");
  }

  return vm;
}
|
||||
|
||||
/**
 * Returns the tenant id that the caller is restricted to, if any.
 *
 * @param req - Request object; only `user` is read.
 * @returns The caller's `tenant_id` when `isTenantScopedUser(req)` is true
 *   and the id is set; `undefined` otherwise (a `null` id is normalised to
 *   `undefined`).
 */
function scopedTenantId(req: Pick<Express.Request, "user">) {
  if (!isTenantScopedUser(req)) {
    return undefined;
  }

  return req.user?.tenant_id ?? undefined;
}
|
||||
|
||||
/**
 * Reads an optional `tenant_id` filter from the query string.
 *
 * @param req - Any object exposing a parsed `query` map.
 * @returns The `tenant_id` value when it is a single, non-blank string;
 *   `undefined` when it is missing, repeated/nested (array or object), or
 *   blank (e.g. `?tenant_id=`), so a blank value is never forwarded as a
 *   filter.
 */
function queryTenantId(req: { query?: Record<string, unknown> }): string | undefined {
  const value = req.query?.tenant_id;

  if (typeof value !== "string" || value.trim() === "") {
    return undefined;
  }

  return value;
}
|
||||
|
||||
/**
 * GET /overview
 *
 * Runs `requireAuth` and `authorize("security:read")`, then responds with the
 * result of `monitoringOverview`, restricted to the caller's tenant when the
 * caller is tenant-scoped. Errors are forwarded to `next`.
 */
router.get("/overview", requireAuth, authorize("security:read"), async (req, res, next) => {
  try {
    const data = await monitoringOverview({
      tenant_id: scopedTenantId(req)
    });

    return res.json(data);
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * GET /health-checks
 *
 * Lists health check definitions as `{ data }`.
 *
 * Query parameters:
 * - `tenant_id`: used only when the caller is not restricted to a tenant.
 * - `enabled`: the literal string `"true"` filters on enabled checks; any
 *   other string filters on disabled ones; omitted means no filter.
 */
router.get("/health-checks", requireAuth, authorize("security:read"), async (req, res, next) => {
  try {
    const data = await listHealthChecks({
      // The caller's own tenant always wins over the query-string filter.
      tenant_id: scopedTenantId(req) ?? queryTenantId(req),
      enabled: typeof req.query.enabled === "string" ? req.query.enabled === "true" : undefined
    });

    return res.json({ data });
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * POST /health-checks
 *
 * Validates the body against `healthCheckSchema`, creates the health check
 * definition, writes an audit entry and responds with 201 and the created
 * record. Validation and scope errors are forwarded to `next`.
 */
router.post("/health-checks", requireAuth, authorize("security:manage"), async (req, res, next) => {
  try {
    const payload = healthCheckSchema.parse(req.body ?? {});

    // A referenced VM must exist and be visible to the caller.
    if (payload.vm_id) {
      await ensureVmTenantScope(payload.vm_id, req);
    }

    // Tenant-scoped callers cannot choose another tenant through the body.
    const tenantId = scopedTenantId(req) ?? payload.tenant_id;
    const check = await createHealthCheckDefinition({
      ...payload,
      tenant_id: tenantId,
      created_by: req.user?.email
    });

    await logAudit({
      action: "monitoring.health_check.create",
      resource_type: "SECURITY",
      resource_id: check.id,
      resource_name: check.name,
      actor_email: req.user!.email,
      actor_role: req.user!.role,
      details: toPrismaJsonValue(payload),
      ip_address: req.ip
    });

    return res.status(201).json(check);
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * PATCH /health-checks/:id
 *
 * Validates the body against a partial `healthCheckSchema`, checks that the
 * health check exists and is within the caller's tenant scope, applies the
 * update and responds with the updated record.
 *
 * Responds (via `next`) with 404 `HEALTH_CHECK_NOT_FOUND` or 403
 * `TENANT_SCOPE_VIOLATION` when those checks fail.
 *
 * NOTE(review): unlike the create route, no audit entry is written here —
 * confirm whether that is intentional.
 */
router.patch("/health-checks/:id", requireAuth, authorize("security:manage"), async (req, res, next) => {
  try {
    const payload = healthCheckSchema.partial().parse(req.body ?? {});
    const existing = await prisma.serverHealthCheck.findUnique({
      where: { id: req.params.id },
      select: {
        id: true,
        tenant_id: true
      }
    });

    if (!existing) {
      throw new HttpError(404, "Health check not found", "HEALTH_CHECK_NOT_FOUND");
    }

    // Records without a tenant_id are not blocked by this comparison.
    if (isTenantScopedUser(req) && req.user?.tenant_id && existing.tenant_id && existing.tenant_id !== req.user.tenant_id) {
      throw new HttpError(403, "Access denied for tenant scope", "TENANT_SCOPE_VIOLATION");
    }

    // A newly referenced VM must exist and be visible to the caller.
    if (payload.vm_id) {
      await ensureVmTenantScope(payload.vm_id, req);
    }

    const updated = await updateHealthCheckDefinition(req.params.id, {
      ...payload,
      tenant_id: scopedTenantId(req) ?? payload.tenant_id
    });

    return res.json(updated);
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * POST /health-checks/:id/run
 *
 * Checks that the health check exists and is within the caller's tenant
 * scope, then responds with the result of `runHealthCheckNow`.
 *
 * Responds (via `next`) with 404 `HEALTH_CHECK_NOT_FOUND` or 403
 * `TENANT_SCOPE_VIOLATION` when those checks fail.
 */
router.post("/health-checks/:id/run", requireAuth, authorize("security:manage"), async (req, res, next) => {
  try {
    const existing = await prisma.serverHealthCheck.findUnique({
      where: { id: req.params.id },
      select: { id: true, tenant_id: true }
    });

    if (!existing) {
      throw new HttpError(404, "Health check not found", "HEALTH_CHECK_NOT_FOUND");
    }

    // Records without a tenant_id are not blocked by this comparison.
    if (isTenantScopedUser(req) && req.user?.tenant_id && existing.tenant_id && existing.tenant_id !== req.user.tenant_id) {
      throw new HttpError(403, "Access denied for tenant scope", "TENANT_SCOPE_VIOLATION");
    }

    const result = await runHealthCheckNow(existing.id);

    return res.json(result);
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * GET /health-checks/:id/results
 *
 * Checks that the health check exists and is within the caller's tenant
 * scope, then responds with its results as `{ data }`.
 *
 * Query parameters:
 * - `limit`: forwarded to `listHealthCheckResults` when it is a finite
 *   number; blank or non-numeric values are ignored.
 *
 * Responds (via `next`) with 404 `HEALTH_CHECK_NOT_FOUND` or 403
 * `TENANT_SCOPE_VIOLATION` when the checks fail.
 */
router.get("/health-checks/:id/results", requireAuth, authorize("security:read"), async (req, res, next) => {
  try {
    const existing = await prisma.serverHealthCheck.findUnique({
      where: { id: req.params.id },
      select: { id: true, tenant_id: true }
    });

    if (!existing) {
      throw new HttpError(404, "Health check not found", "HEALTH_CHECK_NOT_FOUND");
    }

    // Records without a tenant_id are not blocked by this comparison.
    if (isTenantScopedUser(req) && req.user?.tenant_id && existing.tenant_id && existing.tenant_id !== req.user.tenant_id) {
      throw new HttpError(403, "Access denied for tenant scope", "TENANT_SCOPE_VIOLATION");
    }

    // Number("abc") is NaN and Number("") is 0; neither should reach the
    // service as a limit, so unusable input is treated as "not supplied".
    const limitText = typeof req.query.limit === "string" ? req.query.limit.trim() : "";
    const limitValue = Number(limitText);
    const limit = limitText !== "" && Number.isFinite(limitValue) ? limitValue : undefined;

    const data = await listHealthCheckResults(existing.id, limit);

    return res.json({ data });
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * GET /alerts/rules
 *
 * Lists alert rules as `{ data }`.
 *
 * Query parameters:
 * - `tenant_id`: used only when the caller is not restricted to a tenant.
 * - `enabled`: the literal string `"true"` filters on enabled rules; any
 *   other string filters on disabled ones; omitted means no filter.
 */
router.get("/alerts/rules", requireAuth, authorize("security:read"), async (req, res, next) => {
  try {
    const data = await listAlertRules({
      // The caller's own tenant always wins over the query-string filter.
      tenant_id: scopedTenantId(req) ?? queryTenantId(req),
      enabled: typeof req.query.enabled === "string" ? req.query.enabled === "true" : undefined
    });

    return res.json({ data });
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * POST /alerts/rules
 *
 * Validates the body against `alertRuleSchema`, creates the alert rule,
 * writes an audit entry and responds with 201 and the created record.
 * Validation and scope errors are forwarded to `next`.
 */
router.post("/alerts/rules", requireAuth, authorize("security:manage"), async (req, res, next) => {
  try {
    const payload = alertRuleSchema.parse(req.body ?? {});

    // A referenced VM must exist and be visible to the caller.
    if (payload.vm_id) {
      await ensureVmTenantScope(payload.vm_id, req);
    }

    // Tenant-scoped callers cannot choose another tenant through the body.
    const tenantId = scopedTenantId(req) ?? payload.tenant_id;
    const rule = await createAlertRule({
      ...payload,
      tenant_id: tenantId,
      created_by: req.user?.email
    });

    await logAudit({
      action: "monitoring.alert_rule.create",
      resource_type: "SECURITY",
      resource_id: rule.id,
      resource_name: rule.name,
      actor_email: req.user!.email,
      actor_role: req.user!.role,
      details: toPrismaJsonValue(payload),
      ip_address: req.ip
    });

    return res.status(201).json(rule);
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * PATCH /alerts/rules/:id
 *
 * Validates the body against a partial `alertRuleSchema`, checks that the
 * rule exists and is within the caller's tenant scope, applies the update
 * and responds with the updated record.
 *
 * Responds (via `next`) with 404 `ALERT_RULE_NOT_FOUND` or 403
 * `TENANT_SCOPE_VIOLATION` when those checks fail.
 *
 * NOTE(review): unlike the create route, no audit entry is written here —
 * confirm whether that is intentional.
 */
router.patch("/alerts/rules/:id", requireAuth, authorize("security:manage"), async (req, res, next) => {
  try {
    const payload = alertRuleSchema.partial().parse(req.body ?? {});
    const existing = await prisma.monitoringAlertRule.findUnique({
      where: { id: req.params.id },
      select: {
        id: true,
        tenant_id: true
      }
    });

    if (!existing) {
      throw new HttpError(404, "Alert rule not found", "ALERT_RULE_NOT_FOUND");
    }

    // Records without a tenant_id are not blocked by this comparison.
    if (isTenantScopedUser(req) && req.user?.tenant_id && existing.tenant_id && existing.tenant_id !== req.user.tenant_id) {
      throw new HttpError(403, "Access denied for tenant scope", "TENANT_SCOPE_VIOLATION");
    }

    // A newly referenced VM must exist and be visible to the caller.
    if (payload.vm_id) {
      await ensureVmTenantScope(payload.vm_id, req);
    }

    const updated = await updateAlertRule(req.params.id, {
      ...payload,
      tenant_id: scopedTenantId(req) ?? payload.tenant_id
    });

    return res.json(updated);
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * GET /alerts/events
 *
 * Lists alert events as `{ data }`.
 *
 * Query parameters:
 * - `tenant_id`: used only when the caller is not restricted to a tenant.
 * - `status`: matched case-insensitively against `MonitoringAlertStatus`;
 *   unknown values are ignored.
 * - `limit`: forwarded when it is a finite number; blank or non-numeric
 *   values are ignored, mirroring how an unknown `status` is handled.
 */
router.get("/alerts/events", requireAuth, authorize("security:read"), async (req, res, next) => {
  try {
    const statusRaw = typeof req.query.status === "string" ? req.query.status.toUpperCase() : undefined;
    const status = Object.values(MonitoringAlertStatus).includes(statusRaw as MonitoringAlertStatus)
      ? (statusRaw as MonitoringAlertStatus)
      : undefined;

    // Number("abc") is NaN and Number("") is 0; neither should reach the
    // service as a limit, so unusable input is treated as "not supplied".
    const limitText = typeof req.query.limit === "string" ? req.query.limit.trim() : "";
    const limitValue = Number(limitText);
    const limit = limitText !== "" && Number.isFinite(limitValue) ? limitValue : undefined;

    const data = await listAlertEvents({
      tenant_id: scopedTenantId(req) ?? queryTenantId(req),
      status,
      limit
    });

    return res.json({ data });
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * GET /alerts/notifications
 *
 * Lists alert notifications as `{ data }`.
 *
 * Query parameters:
 * - `tenant_id`: used only when the caller is not restricted to a tenant.
 * - `limit`: forwarded when it is a finite number; blank or non-numeric
 *   values are ignored.
 */
router.get("/alerts/notifications", requireAuth, authorize("security:read"), async (req, res, next) => {
  try {
    // Number("abc") is NaN and Number("") is 0; neither should reach the
    // service as a limit, so unusable input is treated as "not supplied".
    const limitText = typeof req.query.limit === "string" ? req.query.limit.trim() : "";
    const limitValue = Number(limitText);
    const limit = limitText !== "" && Number.isFinite(limitValue) ? limitValue : undefined;

    const data = await listAlertNotifications({
      tenant_id: scopedTenantId(req) ?? queryTenantId(req),
      limit
    });

    return res.json({ data });
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * POST /alerts/evaluate
 *
 * Calls `evaluateAlertRulesNow` with the caller's tenant id (or `undefined`
 * when the caller is not tenant-scoped) and responds with its result.
 * Errors are forwarded to `next`.
 */
router.post("/alerts/evaluate", requireAuth, authorize("security:manage"), async (req, res, next) => {
  try {
    const result = await evaluateAlertRulesNow(scopedTenantId(req));

    return res.json(result);
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * GET /insights/faulty-deployments
 *
 * Responds with the result of `faultyDeploymentInsights`.
 *
 * Query parameters:
 * - `days`: forwarded when it is a finite number; blank or non-numeric
 *   values are ignored.
 * - `tenant_id`: used only when the caller is not restricted to a tenant.
 */
router.get("/insights/faulty-deployments", requireAuth, authorize("security:read"), async (req, res, next) => {
  try {
    // Number("abc") is NaN and Number("") is 0; neither is a meaningful
    // window, so unusable input is treated as "not supplied".
    const daysText = typeof req.query.days === "string" ? req.query.days.trim() : "";
    const daysValue = Number(daysText);
    const days = daysText !== "" && Number.isFinite(daysValue) ? daysValue : undefined;

    const data = await faultyDeploymentInsights({
      days,
      tenant_id: scopedTenantId(req) ?? queryTenantId(req)
    });

    return res.json(data);
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
/**
 * GET /insights/cluster-forecast
 *
 * Responds with the result of `clusterResourceForecast`.
 *
 * Query parameters:
 * - `horizon_days`: forwarded when it is a finite number; blank or
 *   non-numeric values are ignored.
 * - `tenant_id`: used only when the caller is not restricted to a tenant.
 */
router.get("/insights/cluster-forecast", requireAuth, authorize("security:read"), async (req, res, next) => {
  try {
    // Number("abc") is NaN and Number("") is 0; neither is a meaningful
    // horizon, so unusable input is treated as "not supplied".
    const horizonText = typeof req.query.horizon_days === "string" ? req.query.horizon_days.trim() : "";
    const horizonValue = Number(horizonText);
    const horizon = horizonText !== "" && Number.isFinite(horizonValue) ? horizonValue : undefined;

    const data = await clusterResourceForecast({
      horizon_days: horizon,
      tenant_id: scopedTenantId(req) ?? queryTenantId(req)
    });

    return res.json(data);
  } catch (error) {
    return next(error);
  }
});
|
||||
|
||||
// Default export: the router with all monitoring routes registered above.
export default router;
|
||||
Reference in New Issue
Block a user