refactor progress

This commit is contained in:
Hampus Kraft
2026-02-17 12:22:36 +00:00
parent cb31608523
commit d5abd1a7e4
8257 changed files with 1190207 additions and 761040 deletions

View File

@@ -0,0 +1,407 @@
{
"name": "Fluxer Critical Alerts",
"description": "Critical alerts for Fluxer services",
"version": 2,
"alerts": [
{
"id": "high-api-error-rate",
"name": "High API Error Rate",
"type": "metric",
"condition": {
"query": "sum(rate(http_server_request_count{service_name='fluxer-api',http_response_status_code=~'5..'}[5m])) > 10",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "critical",
"annotations": {
"summary": "API error rate is above 10 req/s",
"description": "The fluxer-api service is experiencing a high error rate (5xx responses). This may indicate a service degradation or outage."
},
"labels": {
"service": "fluxer-api",
"alert_type": "error_rate"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "high-api-latency",
"name": "High API Latency",
"type": "metric",
"condition": {
"query": "histogram_quantile(0.95, sum(rate(http_server_request_duration_bucket{service_name='fluxer-api'}[5m])) by (le)) > 1000",
"evaluation_interval": "1m",
"for": "10m"
},
"severity": "warning",
"annotations": {
"summary": "API P95 latency is above 1000ms",
"description": "The fluxer-api service is experiencing high latency. 95% of requests are taking longer than 1 second."
},
"labels": {
"service": "fluxer-api",
"alert_type": "latency"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "queue-depth-high",
"name": "Queue Depth Too High",
"type": "metric",
"condition": {
"query": "fluxer_queue_depth > 10000",
"evaluation_interval": "1m",
"for": "15m"
},
"severity": "warning",
"annotations": {
"summary": "Queue depth is above 10,000 jobs",
"description": "The job queue has accumulated more than 10,000 jobs. This may indicate processing is slower than job arrival."
},
"labels": {
"service": "fluxer-queue",
"alert_type": "queue_depth"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "queue-dlq-rate",
"name": "High Dead Letter Queue Rate",
"type": "metric",
"condition": {
"query": "sum(rate(fluxer_queue_dead_letter[5m])) > 5",
"evaluation_interval": "1m",
"for": "10m"
},
"severity": "critical",
"annotations": {
"summary": "DLQ rate is above 5 jobs/sec",
"description": "Jobs are being moved to the dead letter queue at a high rate. This may indicate persistent job failures."
},
"labels": {
"service": "fluxer-queue",
"alert_type": "dlq_rate"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "gateway-connection-drop",
"name": "Gateway Connection Drop Rate",
"type": "metric",
"condition": {
"query": "rate(gateway_websocket_disconnections[1m]) / rate(gateway_websocket_connections[1m]) > 0.5",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "critical",
"annotations": {
"summary": "Gateway disconnect rate exceeds 50% of connect rate",
"description": "WebSocket connections are dropping at an unusually high rate. This may indicate network issues or service instability."
},
"labels": {
"service": "fluxer-gateway",
"alert_type": "connection_stability"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "gateway-rpc-latency-high",
"name": "Gateway RPC Latency High",
"type": "metric",
"condition": {
"query": "gateway_rpc_latency_p95 > 500",
"evaluation_interval": "1m",
"for": "10m"
},
"severity": "warning",
"annotations": {
"summary": "Gateway RPC P95 latency above 500ms",
"description": "RPC calls from gateway to backend are experiencing high latency."
},
"labels": {
"service": "fluxer-gateway",
"alert_type": "latency"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "media-proxy-error-rate",
"name": "Media Proxy High Error Rate",
"type": "metric",
"condition": {
"query": "sum(rate(media_proxy_failure{service_name='fluxer-media-proxy'}[5m])) / sum(rate(http_server_request_count{service_name='fluxer-media-proxy'}[5m])) > 0.1",
"evaluation_interval": "1m",
"for": "10m"
},
"severity": "warning",
"annotations": {
"summary": "Media proxy error rate above 10%",
"description": "The media proxy is failing more than 10% of requests. This may indicate origin issues or cache problems."
},
"labels": {
"service": "fluxer-media-proxy",
"alert_type": "error_rate"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "media-proxy-timeout-rate",
"name": "Media Proxy High Timeout Rate",
"type": "metric",
"condition": {
"query": "sum(rate(media_proxy_failure{error_type='timeout'}[5m])) > 5",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "warning",
"annotations": {
"summary": "Media proxy timeout rate above 5 req/s",
"description": "The media proxy is experiencing a high rate of timeouts. This may indicate network issues or slow origin servers."
},
"labels": {
"service": "fluxer-media-proxy",
"alert_type": "timeout"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "telemetry-ingestion-stopped",
"name": "Telemetry Ingestion Stopped",
"type": "metric",
"condition": {
"query": "increase(signoz_traces_signoz_index_v2[15m]) == 0",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "critical",
"annotations": {
"summary": "No traces being ingested",
"description": "The SigNoz collector has not received any traces in the last 15 minutes. This may indicate a collector issue or service instrumentation failure."
},
"labels": {
"service": "signoz",
"alert_type": "telemetry"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "cron-job-overdue",
"name": "Cron Job Overdue",
"type": "metric",
"condition": {
"query": "time() - max by (cron) (fluxer_queue_cron_tick_timestamp) > 3600",
"evaluation_interval": "5m",
"for": "5m"
},
"severity": "warning",
"annotations": {
"summary": "Cron job has not executed in over 1 hour",
"description": "A scheduled cron job has not run in over an hour. This may indicate a hung cron process or scheduling issue."
},
"labels": {
"service": "fluxer-queue",
"alert_type": "cron"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "csam-match-detected",
"name": "CSAM Match Detected",
"type": "metric",
"condition": {
"query": "sum(rate(fluxer_csam_matches_total{service_name='fluxer-api'}[1m])) > 0",
"evaluation_interval": "1m",
"for": "0m"
},
"severity": "critical",
"annotations": {
"summary": "CSAM content has been detected",
"description": "CSAM content has been detected. Immediate review required."
},
"labels": {
"service": "fluxer-api",
"alert_type": "csam_match"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "photodna-api-error-rate-high",
"name": "PhotoDNA API Error Rate High",
"type": "metric",
"condition": {
"query": "sum(rate(fluxer_csam_photodna_api_total{service_name='fluxer-api',status='error'}[5m])) / sum(rate(fluxer_csam_photodna_api_total{service_name='fluxer-api'}[5m])) > 0.1",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "warning",
"annotations": {
"summary": "PhotoDNA API error rate exceeds 10%",
"description": "PhotoDNA API error rate exceeds 10%"
},
"labels": {
"service": "fluxer-api",
"alert_type": "photodna_error_rate"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "ncmec-submission-failure",
"name": "NCMEC Submission Failure",
"type": "metric",
"condition": {
"query": "sum(rate(fluxer_csam_ncmec_submissions{service_name='fluxer-api',status='error'}[5m])) > 0",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "critical",
"annotations": {
"summary": "NCMEC report submission has failed",
"description": "NCMEC report submission has failed. Manual intervention required."
},
"labels": {
"service": "fluxer-api",
"alert_type": "ncmec_submission"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "csam-scan-failure-rate-high",
"name": "CSAM Scan Failure Rate High",
"type": "metric",
"condition": {
"query": "sum(rate(fluxer_csam_scans_total{service_name='fluxer-api',status='error'}[5m])) / sum(rate(fluxer_csam_scans_total{service_name='fluxer-api'}[5m])) > 0.05",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "warning",
"annotations": {
"summary": "CSAM scan failure rate exceeds 5%",
"description": "CSAM scan failure rate exceeds 5%"
},
"labels": {
"service": "fluxer-api",
"alert_type": "csam_scan_failure_rate"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "photodna-api-latency-high",
"name": "PhotoDNA API Latency High",
"type": "metric",
"condition": {
"query": "histogram_quantile(0.95, sum(rate(fluxer_csam_photodna_api_duration_ms_bucket{service_name='fluxer-api'}[5m])) by (le)) > 5000",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "warning",
"annotations": {
"summary": "PhotoDNA API p95 latency exceeds 5 seconds",
"description": "PhotoDNA API p95 latency exceeds 5 seconds"
},
"labels": {
"service": "fluxer-api",
"alert_type": "photodna_latency"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
}
],
"notification_channels": {
"slack": {
"type": "webhook",
"url": "${ALERT_WEBHOOK_URL}",
"channel_mapping": {
"critical": "#alerts-critical",
"warning": "#alerts-warning"
}
}
}
}

View File

@@ -0,0 +1,329 @@
groups:
- name: fluxer_api_alerts
interval: 30s
rules:
- alert: FluxerHighErrorRate
expr: |
(
rate(http_server_request_count{http_response_status_code=~"5.."}[5m])
/
rate(http_server_request_count[5m])
) > 0.05
and rate(http_server_request_count[5m]) > 10
for: 5m
labels:
severity: critical
service: fluxer-api
alert_type: error_rate
annotations:
summary: 'High error rate on {{ $labels.service_name }}'
description: 'Error rate is above 5% (minimum 10 requests/5m) on {{ $labels.service_name }}. Current value: {{ $value | humanizePercentage }}'
runbook: 'https://docs.fluxer.dev/runbooks/high-error-rate'
- alert: FluxerElevatedErrorRate
expr: |
(
rate(http_server_request_count{http_response_status_code=~"5.."}[5m])
/
rate(http_server_request_count[5m])
) > 0.01
and rate(http_server_request_count[5m]) > 10
for: 10m
labels:
severity: warning
service: fluxer-api
alert_type: error_rate
annotations:
summary: 'Elevated error rate on {{ $labels.service_name }}'
description: 'Error rate is above 1% on {{ $labels.service_name }}. Current value: {{ $value | humanizePercentage }}'
runbook: 'https://docs.fluxer.dev/runbooks/high-error-rate'
- name: fluxer_queue_alerts
interval: 30s
rules:
- alert: FluxerQueueDepthCritical
expr: |
fluxer_queue_depth{service_name="fluxer-queue"} > 10000
for: 5m
labels:
severity: critical
service: fluxer-queue
alert_type: queue_depth
annotations:
summary: 'Queue depth critically high for {{ $labels.queue_name }}'
description: 'Queue {{ $labels.queue_name }} has {{ $value }} jobs pending (threshold: 10,000). Jobs may be delayed or processing is stalled.'
runbook: 'https://docs.fluxer.dev/runbooks/queue-depth-critical'
- alert: FluxerQueueDepthElevated
expr: |
fluxer_queue_depth{service_name="fluxer-queue"} > 5000
for: 10m
labels:
severity: warning
service: fluxer-queue
alert_type: queue_depth
annotations:
summary: 'Queue depth elevated for {{ $labels.queue_name }}'
description: 'Queue {{ $labels.queue_name }} has {{ $value }} jobs pending (threshold: 5,000). Monitor for escalation.'
- alert: FluxerDLQRateCritical
expr: |
sum(rate(fluxer_queue_dead_letter{service_name="fluxer-queue"}[5m])) > 5
for: 5m
labels:
severity: critical
service: fluxer-queue
alert_type: dlq_rate
annotations:
summary: 'High dead letter queue rate'
description: 'Jobs are failing and moving to DLQ at rate {{ $value | humanize }} jobs/sec. Check job failures and error logs.'
runbook: 'https://docs.fluxer.dev/runbooks/high-dlq-rate'
- name: fluxer_gateway_alerts
interval: 30s
rules:
- alert: FluxerGatewayConnectionDropCritical
expr: |
sum(rate(gateway_websocket_disconnections{reason="error"}[1m])) by (service_name) > 10
for: 3m
labels:
severity: critical
service: fluxer-gateway
alert_type: connection_drop
annotations:
summary: 'Critical WebSocket error disconnect rate'
description: 'Gateway experiencing {{ $value | humanize }} error disconnects/min. This may indicate service instability or network issues.'
runbook: 'https://docs.fluxer.dev/runbooks/gateway-connection-drop'
- alert: FluxerGatewayDisconnectElevated
expr: |
sum(rate(gateway_websocket_disconnections{reason="error"}[1m])) by (service_name) > 5
for: 5m
labels:
severity: warning
service: fluxer-gateway
alert_type: connection_drop
annotations:
summary: 'Elevated WebSocket error disconnect rate'
description: 'Gateway experiencing {{ $value | humanize }} error disconnects/min. Monitor for escalation.'
- alert: FluxerGatewayDisconnectRatioHigh
expr: |
(
sum(rate(gateway_websocket_disconnections{reason="error"}[5m])) by (service_name)
/
sum(rate(gateway_websocket_connections[5m])) by (service_name)
) > 0.1
for: 5m
labels:
severity: critical
service: fluxer-gateway
alert_type: disconnect_ratio
annotations:
summary: 'Gateway disconnect ratio above 10%'
description: 'Error disconnects represent {{ $value | humanizePercentage }} of new connections. Check gateway stability.'
runbook: 'https://docs.fluxer.dev/runbooks/gateway-connection-drop'
- alert: FluxerGatewayRPCLatencyHigh
expr: |
histogram_quantile(0.95,
sum(rate(gateway_rpc_latency_bucket{service_name="fluxer-gateway"}[5m])) by (le)
) > 500
for: 10m
labels:
severity: warning
service: fluxer-gateway
alert_type: rpc_latency
annotations:
summary: 'Gateway RPC P95 latency above 500ms'
description: 'Gateway RPC calls experiencing high latency. Current P95: {{ $value | humanize }}ms'
runbook: 'https://docs.fluxer.dev/runbooks/gateway-rpc-latency'
- name: fluxer_log_alerts
interval: 30s
rules:
- alert: FluxerLogErrorSpikeCritical
expr: |
sum(rate(logs_count{severity_text="ERROR"}[5m])) by (service_name) > 50
for: 2m
labels:
severity: critical
alert_type: log_error_spike
annotations:
summary: 'Critical error log volume spike on {{ $labels.service_name }}'
description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} errors/sec. Check logs and traces for root cause.'
runbook: 'https://docs.fluxer.dev/runbooks/log-error-spike'
- alert: FluxerLogErrorElevated
expr: |
sum(rate(logs_count{severity_text="ERROR"}[5m])) by (service_name) > 20
for: 10m
labels:
severity: warning
alert_type: log_error_elevated
annotations:
summary: 'Elevated error log volume on {{ $labels.service_name }}'
description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} errors/sec. Monitor for escalation.'
- alert: FluxerLogWarningElevated
expr: |
sum(rate(logs_count{severity_text="WARN"}[5m])) by (service_name) > 100
for: 10m
labels:
severity: warning
alert_type: log_warning_elevated
annotations:
summary: 'Elevated warning log volume on {{ $labels.service_name }}'
description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} warnings/sec. Review warning patterns.'
- name: fluxer_api_performance_alerts
interval: 30s
rules:
- alert: FluxerAPILatencyCritical
expr: |
histogram_quantile(0.95,
sum(rate(http_server_request_duration_bucket{service_name="fluxer-api"}[5m])) by (le, http_route)
) > 2000
for: 5m
labels:
severity: critical
service: fluxer-api
alert_type: latency
annotations:
summary: 'Critical API latency on route {{ $labels.http_route }}'
description: 'P95 latency for route {{ $labels.http_route }} is above 2 seconds. Current: {{ $value | humanize }}ms'
runbook: 'https://docs.fluxer.dev/runbooks/high-api-latency'
- alert: FluxerAPILatencyElevated
expr: |
histogram_quantile(0.95,
sum(rate(http_server_request_duration_bucket{service_name="fluxer-api"}[5m])) by (le, http_route)
) > 1000
for: 10m
labels:
severity: warning
service: fluxer-api
alert_type: latency
annotations:
summary: 'Elevated API latency on route {{ $labels.http_route }}'
description: 'P95 latency for route {{ $labels.http_route }} is above 1 second. Current: {{ $value | humanize }}ms'
- name: fluxer_database_alerts
interval: 30s
rules:
- alert: FluxerDBLatencyCritical
expr: |
histogram_quantile(0.95,
sum(rate(db_query_latency_bucket[5m])) by (le, query_type)
) > 1000
for: 5m
labels:
severity: critical
alert_type: database_latency
annotations:
summary: 'Critical database query latency for {{ $labels.query_type }}'
description: 'P95 {{ $labels.query_type }} query latency above 1 second. Current: {{ $value | humanize }}ms'
runbook: 'https://docs.fluxer.dev/runbooks/database-latency'
- alert: FluxerDBConnectionPoolHigh
expr: |
db_connection_pool_active / db_connection_pool_max > 0.8
for: 10m
labels:
severity: warning
alert_type: connection_pool
annotations:
summary: 'Database connection pool usage above 80%'
description: 'Connection pool at {{ $value | humanizePercentage }} capacity. May lead to connection waits.'
runbook: 'https://docs.fluxer.dev/runbooks/connection-pool'
- name: fluxer_cache_alerts
interval: 30s
rules:
- alert: FluxerCacheHitRateLow
expr: |
sum(rate(cache_operation{status="hit"}[5m])) by (cache_name)
/
sum(rate(cache_operation{status=~"hit|miss"}[5m])) by (cache_name) < 0.5
for: 15m
labels:
severity: warning
alert_type: cache_efficiency
annotations:
summary: 'Low cache hit rate for {{ $labels.cache_name }}'
description: 'Cache {{ $labels.cache_name }} hit rate below 50%. Current: {{ $value | humanizePercentage }}'
runbook: 'https://docs.fluxer.dev/runbooks/low-cache-hit-rate'
- name: fluxer_worker_alerts
interval: 30s
rules:
- alert: FluxerWorkerFailureRateCritical
expr: |
sum(rate(fluxer_worker_task_failure[5m])) by (task_name) > 1
for: 5m
labels:
severity: critical
alert_type: worker_failure
annotations:
summary: 'Critical worker task failure rate for {{ $labels.task_name }}'
description: 'Worker task {{ $labels.task_name }} failing at {{ $value | humanize }} tasks/sec. Check task logs.'
runbook: 'https://docs.fluxer.dev/runbooks/worker-failures'
- alert: FluxerCronJobOverdue
expr: |
time() - max by (cron) (fluxer_queue_cron_tick_timestamp) > 3600
for: 5m
labels:
severity: warning
service: fluxer-queue
alert_type: cron
annotations:
summary: 'Cron job {{ $labels.cron }} has not executed in over 1 hour'
description: "Scheduled cron job hasn't run since {{ $value | humanizeTimestamp }}. May indicate hung process."
runbook: 'https://docs.fluxer.dev/runbooks/cron-overdue'
- name: fluxer_telemetry_alerts
interval: 60s
rules:
- alert: FluxerTelemetryIngestionStopped
expr: |
increase(signoz_traces_signoz_index_v2[15m]) == 0
for: 5m
labels:
severity: critical
alert_type: telemetry
annotations:
summary: 'No traces being ingested'
description: "SigNoz collector hasn't received traces in 15 minutes. Check collector health and service instrumentation."
runbook: 'https://docs.fluxer.dev/runbooks/telemetry-down'
- name: fluxer_media_proxy_alerts
interval: 30s
rules:
- alert: FluxerMediaProxyErrorRate
expr: |
sum(rate(media_proxy_failure{service_name="fluxer-media-proxy"}[5m]))
/
sum(rate(http_server_request_count{service_name="fluxer-media-proxy"}[5m])) > 0.1
for: 10m
labels:
severity: warning
service: fluxer-media-proxy
alert_type: error_rate
annotations:
summary: 'Media proxy error rate above 10%'
description: 'Media proxy failing {{ $value | humanizePercentage }} of requests. Check origin servers and cache.'
runbook: 'https://docs.fluxer.dev/runbooks/media-proxy-errors'
- alert: FluxerMediaProxyTimeoutRate
expr: |
sum(rate(media_proxy_failure{error_type="timeout"}[5m])) > 5
for: 5m
labels:
severity: warning
service: fluxer-media-proxy
alert_type: timeout
annotations:
summary: 'Media proxy timeout rate above 5 req/s'
description: 'Media proxy experiencing high timeout rate. May indicate network issues or slow origins.'
runbook: 'https://docs.fluxer.dev/runbooks/media-proxy-timeouts'

View File

@@ -0,0 +1,213 @@
x-common: &common
networks:
- fluxer-shared
logging:
options:
max-size: 50m
max-file: '3'
x-deploy-base: &deploy_base
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
x-clickhouse-defaults: &clickhouse_defaults
<<: *common
image: clickhouse/clickhouse-server:25.5.6
tty: true
environment:
- CLICKHOUSE_SKIP_USER_SETUP=1
deploy:
<<: *deploy_base
labels:
signoz.io/scrape: 'true'
signoz.io/port: '9363'
signoz.io/path: '/metrics'
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- 0.0.0.0:8123/ping
interval: 30s
timeout: 5s
retries: 3
ulimits:
nproc: 65535
nofile:
soft: 262144
hard: 262144
x-zookeeper-defaults: &zookeeper_defaults
<<: *common
image: signoz/zookeeper:3.7.1
user: root
deploy:
<<: *deploy_base
labels:
signoz.io/scrape: 'true'
signoz.io/port: '9141'
signoz.io/path: '/metrics'
healthcheck:
test:
- CMD-SHELL
- curl -s -m 2 http://localhost:8080/commands/ruok | grep error | grep null
interval: 30s
timeout: 5s
retries: 3
services:
init-clickhouse:
<<: *common
image: clickhouse/clickhouse-server:25.5.6
command:
- bash
- -c
- |
version="v0.0.1"
node_os=$$(uname -s | tr '[:upper:]' '[:lower:]')
node_arch=$$(uname -m | sed s/aarch64/arm64/ | sed s/x86_64/amd64/)
echo "Fetching histogram-binary for $${node_os}/$${node_arch}"
cd /tmp
wget -O histogram-quantile.tar.gz "https://github.com/SigNoz/signoz/releases/download/histogram-quantile%2F$${version}/histogram-quantile_$${node_os}_$${node_arch}.tar.gz"
tar -xvzf histogram-quantile.tar.gz
mv histogram-quantile /var/lib/clickhouse/user_scripts/histogramQuantile
deploy:
restart_policy:
condition: on-failure
volumes:
- ./conf/clickhouse/user_scripts:/var/lib/clickhouse/user_scripts:rw
zookeeper-1:
<<: *zookeeper_defaults
environment:
- ZOO_SERVER_ID=1
- ALLOW_ANONYMOUS_LOGIN=yes
- ZOO_AUTOPURGE_INTERVAL=1
- ZOO_ENABLE_PROMETHEUS_METRICS=yes
- ZOO_PROMETHEUS_METRICS_PORT_NUMBER=9141
volumes:
- zookeeper-1:/bitnami/zookeeper
clickhouse:
<<: *clickhouse_defaults
hostname: clickhouse
configs:
- source: clickhouse-config
target: /etc/clickhouse-server/config.xml
- source: clickhouse-users
target: /etc/clickhouse-server/users.xml
- source: clickhouse-custom-function
target: /etc/clickhouse-server/custom-function.xml
- source: clickhouse-cluster
target: /etc/clickhouse-server/config.d/cluster.xml
volumes:
- clickhouse:/var/lib/clickhouse/
schema-migrator:
<<: *common
image: signoz/signoz-schema-migrator:${OTELCOL_TAG:-v0.129.12}
entrypoint: sh
command:
- -c
- /signoz-schema-migrator sync --dsn=tcp://clickhouse:9000 --up= && /signoz-schema-migrator async --dsn=tcp://clickhouse:9000 --up=
deploy:
restart_policy:
condition: on-failure
delay: 5s
signoz:
<<: *common
image: signoz/signoz:${SIGNOZ_IMAGE_TAG:-v0.108.0}
command:
- --config=/root/config/prometheus.yml
environment:
- SIGNOZ_ALERTMANAGER_PROVIDER=signoz
- SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_DSN=tcp://clickhouse:9000
- SIGNOZ_SQLSTORE_SQLITE_PATH=/var/lib/signoz/signoz.db
- DASHBOARDS_PATH=/root/config/dashboards
- STORAGE=clickhouse
- GODEBUG=netdns=go
- TELEMETRY_ENABLED=true
- DEPLOYMENT_TYPE=docker-swarm
- DOT_METRICS_ENABLED=true
configs:
- source: signoz-prometheus-config
target: /root/config/prometheus.yml
volumes:
- sqlite:/var/lib/signoz/
- ./dashboards:/root/config/dashboards:ro
deploy:
<<: *deploy_base
replicas: 1
labels:
- 'caddy=signoz.fluxer.app'
- 'caddy.reverse_proxy={{upstreams 8080}}'
- 'caddy.header.Strict-Transport-Security="max-age=31536000; includeSubDomains; preload"'
- 'caddy.header.X-Xss-Protection="1; mode=block"'
- 'caddy.header.X-Content-Type-Options=nosniff'
- 'caddy.header.Referrer-Policy=strict-origin-when-cross-origin'
- 'caddy.header.X-Frame-Options=DENY'
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- localhost:8080/api/v1/health
interval: 30s
timeout: 5s
retries: 3
otel-collector:
<<: *common
image: signoz/signoz-otel-collector:${OTELCOL_TAG:-v0.129.12}
command:
- --config=/etc/otel-collector-config.yaml
- --manager-config=/etc/manager-config.yaml
- --copy-path=/var/tmp/collector-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
configs:
- source: otel-collector-config
target: /etc/otel-collector-config.yaml
- source: otel-manager-config
target: /etc/manager-config.yaml
environment:
- OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}}
- LOW_CARDINAL_EXCEPTION_GROUPING=false
ports:
- '4317:4317'
- '4318:4318'
deploy:
<<: *deploy_base
replicas: 3
networks:
fluxer-shared:
external: true
volumes:
clickhouse:
driver: local
sqlite:
driver: local
zookeeper-1:
driver: local
configs:
clickhouse-config:
file: ./conf/clickhouse/config.xml
clickhouse-users:
file: ./conf/clickhouse/users.xml
clickhouse-custom-function:
file: ./conf/clickhouse/custom-function.xml
clickhouse-cluster:
file: ./conf/clickhouse/cluster.xml
signoz-prometheus-config:
file: ./conf/signoz/prometheus.yml
otel-collector-config:
file: ./conf/signoz/otel-collector-config.yaml
otel-manager-config:
file: ./conf/signoz/otel-collector-opamp-config.yaml

View File

@@ -0,0 +1,75 @@
<?xml version="1.0"?>
<clickhouse>
<!-- ZooKeeper is used to store metadata about replicas, when using Replicated tables.
Optional. If you don't use replicated tables, you could omit that.
See https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication/
-->
<zookeeper>
<node index="1">
<host>zookeeper-1</host>
<port>2181</port>
</node>
<!-- <node index="2">
<host>zookeeper-2</host>
<port>2181</port>
</node>
<node index="3">
<host>zookeeper-3</host>
<port>2181</port>
</node> -->
</zookeeper>
<!-- Configuration of clusters that could be used in Distributed tables.
https://clickhouse.com/docs/en/operations/table_engines/distributed/
-->
<remote_servers>
<cluster>
<!-- Inter-server per-cluster secret for Distributed queries
default: no secret (no authentication will be performed)
If set, then Distributed queries will be validated on shards, so at least:
- such cluster should exist on the shard,
- such cluster should have the same secret.
And also (and which is more important), the initial_user will
be used as current user for the query.
Right now the protocol is pretty simple and it only takes into account:
- cluster name
- query
Also it will be nice if the following will be implemented:
- source hostname (see interserver_http_host), but then it will depends from DNS,
it can use IP address instead, but then the you need to get correct on the initiator node.
- target hostname / ip address (same notes as for source hostname)
- time-based security tokens
-->
<!-- <secret></secret> -->
<shard>
<!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
<!-- <internal_replication>false</internal_replication> -->
<!-- Optional. Shard weight when writing data. Default: 1. -->
<!-- <weight>1</weight> -->
<replica>
<host>clickhouse</host>
<port>9000</port>
<!-- Optional. Priority of the replica for load_balancing. Default: 1 (less value has more priority). -->
<!-- <priority>1</priority> -->
</replica>
</shard>
<!-- <shard>
<replica>
<host>clickhouse-2</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>clickhouse-3</host>
<port>9000</port>
</replica>
</shard> -->
</cluster>
</remote_servers>
</clickhouse>

View File

@@ -0,0 +1,8 @@
<clickhouse>
<keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<snapshot_storage_path>/var/lib/clickhouse/coordination/snapshots</snapshot_storage_path>
</keeper_server>
</clickhouse>

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,21 @@
<functions>
<function>
<type>executable</type>
<name>histogramQuantile</name>
<return_type>Float64</return_type>
<argument>
<type>Array(Float64)</type>
<name>buckets</name>
</argument>
<argument>
<type>Array(Float64)</type>
<name>counts</name>
</argument>
<argument>
<type>Float64</type>
<name>quantile</name>
</argument>
<format>CSV</format>
<command>./histogramQuantile</command>
</function>
</functions>

View File

@@ -0,0 +1,123 @@
<?xml version="1.0"?>
<clickhouse>
<!-- See also the files in users.d directory where the settings can be overridden. -->
<!-- Profiles of settings. -->
<profiles>
<!-- Default settings. -->
<default>
<!-- Maximum memory usage for processing single query, in bytes. -->
<max_memory_usage>10000000000</max_memory_usage>
<!-- How to choose between replicas during distributed query processing.
random - choose random replica from set of replicas with minimum number of errors
nearest_hostname - from set of replicas with minimum number of errors, choose replica
with minimum number of different symbols between replica's hostname and local hostname
(Hamming distance).
in_order - first live replica is chosen in specified order.
first_or_random - if first replica one has higher number of errors, pick a random one from replicas with minimum number of errors.
-->
<load_balancing>random</load_balancing>
</default>
<!-- Profile that allows only read queries. -->
<readonly>
<readonly>1</readonly>
</readonly>
</profiles>
<!-- Users and ACL. -->
<users>
<!-- If user name was not specified, 'default' user is used. -->
<default>
<!-- See also the files in users.d directory where the password can be overridden.
Password could be specified in plaintext or in SHA256 (in hex format).
If you want to specify password in plaintext (not recommended), place it in 'password' element.
Example: <password>qwerty</password>.
Password could be empty.
If you want to specify SHA256, place it in 'password_sha256_hex' element.
Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex>
Restrictions of SHA256: impossibility to connect to ClickHouse using MySQL JS client (as of July 2019).
If you want to specify double SHA1, place it in 'password_double_sha1_hex' element.
Example: <password_double_sha1_hex>e395796d6546b1b65db9d665cd43f0e858dd4303</password_double_sha1_hex>
If you want to specify a previously defined LDAP server (see 'ldap_servers' in the main config) for authentication,
place its name in 'server' element inside 'ldap' element.
Example: <ldap><server>my_ldap_server</server></ldap>
If you want to authenticate the user via Kerberos (assuming Kerberos is enabled, see 'kerberos' in the main config),
place 'kerberos' element instead of 'password' (and similar) elements.
The name part of the canonical principal name of the initiator must match the user name for authentication to succeed.
You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests
whose initiator's realm matches it.
Example: <kerberos />
Example: <kerberos><realm>EXAMPLE.COM</realm></kerberos>
How to generate decent password:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
In first line will be password and in second - corresponding SHA256.
How to generate double SHA1:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
In first line will be password and in second - corresponding double SHA1.
-->
<password></password>
<!-- List of networks with open access.
To open access from everywhere, specify:
<ip>::/0</ip>
To open access only from localhost, specify:
<ip>::1</ip>
<ip>127.0.0.1</ip>
Each element of list has one of the following forms:
<ip> IP-address or network mask. Examples: 213.180.204.3 or 10.0.0.1/8 or 10.0.0.1/255.255.255.0
2a02:6b8::3 or 2a02:6b8::3/64 or 2a02:6b8::3/ffff:ffff:ffff:ffff::.
<host> Hostname. Example: server01.clickhouse.com.
To check access, DNS query is performed, and all received addresses compared to peer address.
<host_regexp> Regular expression for host names. Example, ^server\d\d-\d\d-\d\.clickhouse\.com$
To check access, DNS PTR query is performed for peer address and then regexp is applied.
Then, for result of PTR query, another DNS query is performed and all received addresses compared to peer address.
Strongly recommended that regexp is ends with $
All results of DNS requests are cached till server restart.
-->
<networks>
<ip>::/0</ip>
</networks>
<!-- Settings profile for user. -->
<profile>default</profile>
<!-- Quota for user. -->
<quota>default</quota>
<!-- User can create other users and grant rights to them. -->
<!-- <access_management>1</access_management> -->
</default>
</users>
<!-- Quotas. -->
<quotas>
<!-- Name of quota. -->
<default>
<!-- Limits for time interval. You could specify many intervals with different limits. -->
<interval>
<!-- Length of interval. -->
<duration>3600</duration>
<!-- No limits. Just calculate resource usage for time interval. -->
<queries>0</queries>
<errors>0</errors>
<result_rows>0</result_rows>
<read_rows>0</read_rows>
<execution_time>0</execution_time>
</interval>
</default>
</quotas>
</clickhouse>

View File

@@ -0,0 +1,109 @@
# OpenTelemetry Collector configuration for the SigNoz stack.
# Data flow: OTLP / Prometheus receivers -> processors -> ClickHouse exporters,
# with the "signozmeter" connector bridging the telemetry pipelines into a
# separate usage-metering pipeline (metrics/meter, defined at the bottom).
connectors:
  # Connector: consumed as an exporter by the traces/metrics/logs pipelines
  # below and re-emitted as a receiver into the metrics/meter pipeline.
  signozmeter:
    metrics_flush_interval: 1h
    dimensions:
      - name: service.name
      - name: deployment.environment
      - name: host.name
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
  # Scrapes the collector's own internal telemetry endpoint (localhost:8888).
  prometheus:
    config:
      global:
        scrape_interval: 60s
      scrape_configs:
        - job_name: otel-collector
          static_configs:
            - targets:
                - localhost:8888
              labels:
                job_name: otel-collector
processors:
  batch:
    send_batch_size: 10000
    send_batch_max_size: 11000
    timeout: 10s
  # Larger batches with a much shorter flush timeout, used only by the
  # metering pipeline (metrics/meter).
  batch/meter:
    send_batch_max_size: 25000
    send_batch_size: 20000
    timeout: 1s
  # NOTE(review): resourcedetection is defined here but not referenced by any
  # pipeline in the service section — confirm whether it should be added to
  # the pipelines or removed.
  resourcedetection:
    detectors: [env, system]
    timeout: 2s
  # Derives span metrics from traces (delta temporality, exponential
  # histograms enabled) and writes them via signozclickhousemetrics.
  signozspanmetrics/delta:
    metrics_exporter: signozclickhousemetrics
    metrics_flush_interval: 60s
    latency_histogram_buckets:
      [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s]
    dimensions_cache_size: 100000
    aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
    enable_exp_histogram: true
    dimensions:
      - name: service.namespace
        default: default
      - name: deployment.environment
        default: default
      - name: signoz.collector.id
      - name: service.version
      - name: browser.platform
      - name: browser.mobile
      - name: k8s.cluster.name
      - name: k8s.node.name
      - name: k8s.namespace.name
      - name: host.name
      - name: host.type
      - name: container.name
extensions:
  health_check:
    endpoint: 0.0.0.0:13133
  pprof:
    endpoint: 0.0.0.0:1777
exporters:
  # All exporters write to per-signal ClickHouse databases on the same host.
  clickhousetraces:
    datasource: tcp://clickhouse:9000/signoz_traces
    low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING}
    use_new_schema: true
  signozclickhousemetrics:
    dsn: tcp://clickhouse:9000/signoz_metrics
  clickhouselogsexporter:
    dsn: tcp://clickhouse:9000/signoz_logs
    timeout: 10s
    use_new_schema: true
  # Metering sink; queueing disabled so failures surface immediately.
  signozclickhousemeter:
    dsn: tcp://clickhouse:9000/signoz_meter
    timeout: 45s
    sending_queue:
      enabled: false
service:
  telemetry:
    logs:
      encoding: json
  extensions:
    - health_check
    - pprof
  pipelines:
    traces:
      receivers: [otlp]
      processors: [signozspanmetrics/delta, batch]
      exporters: [clickhousetraces, signozmeter]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [signozclickhousemetrics, signozmeter]
    metrics/prometheus:
      receivers: [prometheus]
      processors: [batch]
      exporters: [signozclickhousemetrics, signozmeter]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [clickhouselogsexporter, signozmeter]
    # Metering pipeline: fed exclusively by the signozmeter connector.
    metrics/meter:
      receivers: [signozmeter]
      processors: [batch/meter]
      exporters: [signozclickhousemeter]

View File

@@ -0,0 +1 @@
# OpAMP endpoint the collector connects to for remote management —
# presumably the SigNoz server's OpAMP websocket port; verify against deployment.
server_endpoint: ws://signoz:4320/v1/opamp

View File

@@ -0,0 +1,16 @@
# Prometheus-style configuration for the SigNoz query service.
global:
  scrape_interval: 5s
  evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
# No local rules or scrape jobs: metrics are read back from ClickHouse.
rule_files: []
scrape_configs: []
remote_read:
  # NOTE(review): a tcp:// ClickHouse DSN rather than an HTTP remote-read URL —
  # presumably a SigNoz-specific extension; confirm against SigNoz docs.
  - url: tcp://clickhouse:9000/signoz_metrics

View File

38
fluxer_devops/signoz/deploy.sh Executable file
View File

@@ -0,0 +1,38 @@
#!/usr/bin/env sh
# Copyright (C) 2026 Fluxer Contributors
#
# This file is part of Fluxer.
#
# Fluxer is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Fluxer is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Fluxer. If not, see <https://www.gnu.org/licenses/>.

# Deploy the SigNoz observability stack as a Docker Swarm stack.
#
# Environment overrides:
#   STACK            - stack name (default: fluxer-signoz)
#   SIGNOZ_IMAGE_TAG - SigNoz image tag   (default: v0.105.1)
set -eu

SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
STACK=${STACK:-fluxer-signoz}
SIGNOZ_IMAGE_TAG=${SIGNOZ_IMAGE_TAG:-v0.105.1}

# Check swarm state BEFORE touching the overlay network: 'docker network
# create -d overlay' fails on a non-swarm daemon, so with the original
# ordering (network first, check second) the helpful error below was
# unreachable on a fresh host.
if [ "$(docker info --format '{{.Swarm.LocalNodeState}}')" != "active" ]; then
    echo "Docker swarm must be active for stack deployment. Run 'docker swarm init' and try again."
    exit 1
fi

# Shared overlay network so other Fluxer stacks can reach the SigNoz services.
if ! docker network inspect fluxer-shared >/dev/null 2>&1; then
    docker network create -d overlay fluxer-shared
fi

# Exported so compose.yaml can interpolate them during stack deploy.
export STACK
export SIGNOZ_IMAGE_TAG
docker stack deploy --with-registry-auth -c "$SCRIPT_DIR/compose.yaml" "$STACK"