refactor progress

This commit is contained in:
Hampus Kraft
2026-02-17 12:22:36 +00:00
parent cb31608523
commit d5abd1a7e4
8257 changed files with 1190207 additions and 761040 deletions

View File

@@ -0,0 +1,407 @@
{
"name": "Fluxer Critical Alerts",
"description": "Critical alerts for Fluxer services",
"version": 2,
"alerts": [
{
"id": "high-api-error-rate",
"name": "High API Error Rate",
"type": "metric",
"condition": {
"query": "sum(rate(http_server_request_count{service_name='fluxer-api',http_response_status_code=~'5..'}[5m])) > 10",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "critical",
"annotations": {
"summary": "API error rate is above 10 req/s",
"description": "The fluxer-api service is experiencing a high error rate (5xx responses). This may indicate a service degradation or outage."
},
"labels": {
"service": "fluxer-api",
"alert_type": "error_rate"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "high-api-latency",
"name": "High API Latency",
"type": "metric",
"condition": {
"query": "histogram_quantile(0.95, sum(rate(http_server_request_duration_bucket{service_name='fluxer-api'}[5m])) by (le)) > 1000",
"evaluation_interval": "1m",
"for": "10m"
},
"severity": "warning",
"annotations": {
"summary": "API P95 latency is above 1000ms",
"description": "The fluxer-api service is experiencing high latency. 95% of requests are taking longer than 1 second."
},
"labels": {
"service": "fluxer-api",
"alert_type": "latency"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "queue-depth-high",
"name": "Queue Depth Too High",
"type": "metric",
"condition": {
"query": "fluxer_queue_depth > 10000",
"evaluation_interval": "1m",
"for": "15m"
},
"severity": "warning",
"annotations": {
"summary": "Queue depth is above 10,000 jobs",
"description": "The job queue has accumulated more than 10,000 jobs. This may indicate processing is slower than job arrival."
},
"labels": {
"service": "fluxer-queue",
"alert_type": "queue_depth"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "queue-dlq-rate",
"name": "High Dead Letter Queue Rate",
"type": "metric",
"condition": {
"query": "sum(rate(fluxer_queue_dead_letter[5m])) > 5",
"evaluation_interval": "1m",
"for": "10m"
},
"severity": "critical",
"annotations": {
"summary": "DLQ rate is above 5 jobs/sec",
"description": "Jobs are being moved to the dead letter queue at a high rate. This may indicate persistent job failures."
},
"labels": {
"service": "fluxer-queue",
"alert_type": "dlq_rate"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "gateway-connection-drop",
"name": "Gateway Connection Drop Rate",
"type": "metric",
"condition": {
"query": "rate(gateway_websocket_disconnections[1m]) / rate(gateway_websocket_connections[1m]) > 0.5",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "critical",
"annotations": {
"summary": "Gateway disconnect rate exceeds 50% of connect rate",
"description": "WebSocket connections are dropping at an unusually high rate. This may indicate network issues or service instability."
},
"labels": {
"service": "fluxer-gateway",
"alert_type": "connection_stability"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "gateway-rpc-latency-high",
"name": "Gateway RPC Latency High",
"type": "metric",
"condition": {
"query": "gateway_rpc_latency_p95 > 500",
"evaluation_interval": "1m",
"for": "10m"
},
"severity": "warning",
"annotations": {
"summary": "Gateway RPC P95 latency above 500ms",
"description": "RPC calls from gateway to backend are experiencing high latency."
},
"labels": {
"service": "fluxer-gateway",
"alert_type": "latency"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "media-proxy-error-rate",
"name": "Media Proxy High Error Rate",
"type": "metric",
"condition": {
"query": "sum(rate(media_proxy_failure{service_name='fluxer-media-proxy'}[5m])) / sum(rate(http_server_request_count{service_name='fluxer-media-proxy'}[5m])) > 0.1",
"evaluation_interval": "1m",
"for": "10m"
},
"severity": "warning",
"annotations": {
"summary": "Media proxy error rate above 10%",
"description": "The media proxy is failing more than 10% of requests. This may indicate origin issues or cache problems."
},
"labels": {
"service": "fluxer-media-proxy",
"alert_type": "error_rate"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "media-proxy-timeout-rate",
"name": "Media Proxy High Timeout Rate",
"type": "metric",
"condition": {
"query": "sum(rate(media_proxy_failure{error_type='timeout'}[5m])) > 5",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "warning",
"annotations": {
"summary": "Media proxy timeout rate above 5 req/s",
"description": "The media proxy is experiencing a high rate of timeouts. This may indicate network issues or slow origin servers."
},
"labels": {
"service": "fluxer-media-proxy",
"alert_type": "timeout"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "telemetry-ingestion-stopped",
"name": "Telemetry Ingestion Stopped",
"type": "metric",
"condition": {
"query": "increase(signoz_traces_signoz_index_v2[15m]) == 0",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "critical",
"annotations": {
"summary": "No traces being ingested",
"description": "The SigNoz collector has not received any traces in the last 15 minutes. This may indicate a collector issue or service instrumentation failure."
},
"labels": {
"service": "signoz",
"alert_type": "telemetry"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "cron-job-overdue",
"name": "Cron Job Overdue",
"type": "metric",
"condition": {
"query": "time() - max by (cron) (fluxer_queue_cron_tick_timestamp) > 3600",
"evaluation_interval": "5m",
"for": "5m"
},
"severity": "warning",
"annotations": {
"summary": "Cron job has not executed in over 1 hour",
"description": "A scheduled cron job has not run in over an hour. This may indicate a hung cron process or scheduling issue."
},
"labels": {
"service": "fluxer-queue",
"alert_type": "cron"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "csam-match-detected",
"name": "CSAM Match Detected",
"type": "metric",
"condition": {
"query": "sum(rate(fluxer_csam_matches_total{service_name='fluxer-api'}[1m])) > 0",
"evaluation_interval": "1m",
"for": "0m"
},
"severity": "critical",
"annotations": {
"summary": "CSAM content has been detected",
"description": "CSAM content has been detected. Immediate review required."
},
"labels": {
"service": "fluxer-api",
"alert_type": "csam_match"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "photodna-api-error-rate-high",
"name": "PhotoDNA API Error Rate High",
"type": "metric",
"condition": {
"query": "sum(rate(fluxer_csam_photodna_api_total{service_name='fluxer-api',status='error'}[5m])) / sum(rate(fluxer_csam_photodna_api_total{service_name='fluxer-api'}[5m])) > 0.1",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "warning",
"annotations": {
"summary": "PhotoDNA API error rate exceeds 10%",
"description": "PhotoDNA API error rate exceeds 10%"
},
"labels": {
"service": "fluxer-api",
"alert_type": "photodna_error_rate"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "ncmec-submission-failure",
"name": "NCMEC Submission Failure",
"type": "metric",
"condition": {
"query": "sum(rate(fluxer_csam_ncmec_submissions{service_name='fluxer-api',status='error'}[5m])) > 0",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "critical",
"annotations": {
"summary": "NCMEC report submission has failed",
"description": "NCMEC report submission has failed. Manual intervention required."
},
"labels": {
"service": "fluxer-api",
"alert_type": "ncmec_submission"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-critical"
}
]
},
{
"id": "csam-scan-failure-rate-high",
"name": "CSAM Scan Failure Rate High",
"type": "metric",
"condition": {
"query": "sum(rate(fluxer_csam_scans_total{service_name='fluxer-api',status='error'}[5m])) / sum(rate(fluxer_csam_scans_total{service_name='fluxer-api'}[5m])) > 0.05",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "warning",
"annotations": {
"summary": "CSAM scan failure rate exceeds 5%",
"description": "CSAM scan failure rate exceeds 5%"
},
"labels": {
"service": "fluxer-api",
"alert_type": "csam_scan_failure_rate"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
},
{
"id": "photodna-api-latency-high",
"name": "PhotoDNA API Latency High",
"type": "metric",
"condition": {
"query": "histogram_quantile(0.95, sum(rate(fluxer_csam_photodna_api_duration_ms_bucket{service_name='fluxer-api'}[5m])) by (le)) > 5000",
"evaluation_interval": "1m",
"for": "5m"
},
"severity": "warning",
"annotations": {
"summary": "PhotoDNA API p95 latency exceeds 5 seconds",
"description": "PhotoDNA API p95 latency exceeds 5 seconds"
},
"labels": {
"service": "fluxer-api",
"alert_type": "photodna_latency"
},
"actions": [
{
"type": "notification",
"channel": "slack",
"target": "#alerts-warning"
}
]
}
],
"notification_channels": {
"slack": {
"type": "webhook",
"url": "${ALERT_WEBHOOK_URL}",
"channel_mapping": {
"critical": "#alerts-critical",
"warning": "#alerts-warning"
}
}
}
}

View File

@@ -0,0 +1,329 @@
groups:
- name: fluxer_api_alerts
interval: 30s
rules:
- alert: FluxerHighErrorRate
expr: |
(
rate(http_server_request_count{http_response_status_code=~"5.."}[5m])
/
rate(http_server_request_count[5m])
) > 0.05
and rate(http_server_request_count[5m]) > 10
for: 5m
labels:
severity: critical
service: fluxer-api
alert_type: error_rate
annotations:
summary: 'High error rate on {{ $labels.service_name }}'
description: 'Error rate is above 5% (minimum 10 requests/5m) on {{ $labels.service_name }}. Current value: {{ $value | humanizePercentage }}'
runbook: 'https://docs.fluxer.dev/runbooks/high-error-rate'
- alert: FluxerElevatedErrorRate
expr: |
(
rate(http_server_request_count{http_response_status_code=~"5.."}[5m])
/
rate(http_server_request_count[5m])
) > 0.01
and rate(http_server_request_count[5m]) > 10
for: 10m
labels:
severity: warning
service: fluxer-api
alert_type: error_rate
annotations:
summary: 'Elevated error rate on {{ $labels.service_name }}'
description: 'Error rate is above 1% on {{ $labels.service_name }}. Current value: {{ $value | humanizePercentage }}'
runbook: 'https://docs.fluxer.dev/runbooks/high-error-rate'
- name: fluxer_queue_alerts
interval: 30s
rules:
- alert: FluxerQueueDepthCritical
expr: |
fluxer_queue_depth{service_name="fluxer-queue"} > 10000
for: 5m
labels:
severity: critical
service: fluxer-queue
alert_type: queue_depth
annotations:
summary: 'Queue depth critically high for {{ $labels.queue_name }}'
description: 'Queue {{ $labels.queue_name }} has {{ $value }} jobs pending (threshold: 10,000). Jobs may be delayed or processing is stalled.'
runbook: 'https://docs.fluxer.dev/runbooks/queue-depth-critical'
- alert: FluxerQueueDepthElevated
expr: |
fluxer_queue_depth{service_name="fluxer-queue"} > 5000
for: 10m
labels:
severity: warning
service: fluxer-queue
alert_type: queue_depth
annotations:
summary: 'Queue depth elevated for {{ $labels.queue_name }}'
description: 'Queue {{ $labels.queue_name }} has {{ $value }} jobs pending (threshold: 5,000). Monitor for escalation.'
- alert: FluxerDLQRateCritical
expr: |
sum(rate(fluxer_queue_dead_letter{service_name="fluxer-queue"}[5m])) > 5
for: 5m
labels:
severity: critical
service: fluxer-queue
alert_type: dlq_rate
annotations:
summary: 'High dead letter queue rate'
description: 'Jobs are failing and moving to DLQ at rate {{ $value | humanize }} jobs/sec. Check job failures and error logs.'
runbook: 'https://docs.fluxer.dev/runbooks/high-dlq-rate'
- name: fluxer_gateway_alerts
interval: 30s
rules:
- alert: FluxerGatewayConnectionDropCritical
expr: |
sum(rate(gateway_websocket_disconnections{reason="error"}[1m])) by (service_name) > 10
for: 3m
labels:
severity: critical
service: fluxer-gateway
alert_type: connection_drop
annotations:
summary: 'Critical WebSocket error disconnect rate'
description: 'Gateway experiencing {{ $value | humanize }} error disconnects/min. This may indicate service instability or network issues.'
runbook: 'https://docs.fluxer.dev/runbooks/gateway-connection-drop'
- alert: FluxerGatewayDisconnectElevated
expr: |
sum(rate(gateway_websocket_disconnections{reason="error"}[1m])) by (service_name) > 5
for: 5m
labels:
severity: warning
service: fluxer-gateway
alert_type: connection_drop
annotations:
summary: 'Elevated WebSocket error disconnect rate'
description: 'Gateway experiencing {{ $value | humanize }} error disconnects/min. Monitor for escalation.'
- alert: FluxerGatewayDisconnectRatioHigh
expr: |
(
sum(rate(gateway_websocket_disconnections{reason="error"}[5m])) by (service_name)
/
sum(rate(gateway_websocket_connections[5m])) by (service_name)
) > 0.1
for: 5m
labels:
severity: critical
service: fluxer-gateway
alert_type: disconnect_ratio
annotations:
summary: 'Gateway disconnect ratio above 10%'
description: 'Error disconnects represent {{ $value | humanizePercentage }} of new connections. Check gateway stability.'
runbook: 'https://docs.fluxer.dev/runbooks/gateway-connection-drop'
- alert: FluxerGatewayRPCLatencyHigh
expr: |
histogram_quantile(0.95,
sum(rate(gateway_rpc_latency_bucket{service_name="fluxer-gateway"}[5m])) by (le)
) > 500
for: 10m
labels:
severity: warning
service: fluxer-gateway
alert_type: rpc_latency
annotations:
summary: 'Gateway RPC P95 latency above 500ms'
description: 'Gateway RPC calls experiencing high latency. Current P95: {{ $value | humanize }}ms'
runbook: 'https://docs.fluxer.dev/runbooks/gateway-rpc-latency'
- name: fluxer_log_alerts
interval: 30s
rules:
- alert: FluxerLogErrorSpikeCritical
expr: |
sum(rate(logs_count{severity_text="ERROR"}[5m])) by (service_name) > 50
for: 2m
labels:
severity: critical
alert_type: log_error_spike
annotations:
summary: 'Critical error log volume spike on {{ $labels.service_name }}'
description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} errors/sec. Check logs and traces for root cause.'
runbook: 'https://docs.fluxer.dev/runbooks/log-error-spike'
- alert: FluxerLogErrorElevated
expr: |
sum(rate(logs_count{severity_text="ERROR"}[5m])) by (service_name) > 20
for: 10m
labels:
severity: warning
alert_type: log_error_elevated
annotations:
summary: 'Elevated error log volume on {{ $labels.service_name }}'
description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} errors/sec. Monitor for escalation.'
- alert: FluxerLogWarningElevated
expr: |
sum(rate(logs_count{severity_text="WARN"}[5m])) by (service_name) > 100
for: 10m
labels:
severity: warning
alert_type: log_warning_elevated
annotations:
summary: 'Elevated warning log volume on {{ $labels.service_name }}'
description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} warnings/sec. Review warning patterns.'
- name: fluxer_api_performance_alerts
interval: 30s
rules:
- alert: FluxerAPILatencyCritical
expr: |
histogram_quantile(0.95,
sum(rate(http_server_request_duration_bucket{service_name="fluxer-api"}[5m])) by (le, http_route)
) > 2000
for: 5m
labels:
severity: critical
service: fluxer-api
alert_type: latency
annotations:
summary: 'Critical API latency on route {{ $labels.http_route }}'
description: 'P95 latency for route {{ $labels.http_route }} is above 2 seconds. Current: {{ $value | humanize }}ms'
runbook: 'https://docs.fluxer.dev/runbooks/high-api-latency'
- alert: FluxerAPILatencyElevated
expr: |
histogram_quantile(0.95,
sum(rate(http_server_request_duration_bucket{service_name="fluxer-api"}[5m])) by (le, http_route)
) > 1000
for: 10m
labels:
severity: warning
service: fluxer-api
alert_type: latency
annotations:
summary: 'Elevated API latency on route {{ $labels.http_route }}'
description: 'P95 latency for route {{ $labels.http_route }} is above 1 second. Current: {{ $value | humanize }}ms'
- name: fluxer_database_alerts
interval: 30s
rules:
- alert: FluxerDBLatencyCritical
expr: |
histogram_quantile(0.95,
sum(rate(db_query_latency_bucket[5m])) by (le, query_type)
) > 1000
for: 5m
labels:
severity: critical
alert_type: database_latency
annotations:
summary: 'Critical database query latency for {{ $labels.query_type }}'
description: 'P95 {{ $labels.query_type }} query latency above 1 second. Current: {{ $value | humanize }}ms'
runbook: 'https://docs.fluxer.dev/runbooks/database-latency'
- alert: FluxerDBConnectionPoolHigh
expr: |
db_connection_pool_active / db_connection_pool_max > 0.8
for: 10m
labels:
severity: warning
alert_type: connection_pool
annotations:
summary: 'Database connection pool usage above 80%'
description: 'Connection pool at {{ $value | humanizePercentage }} capacity. May lead to connection waits.'
runbook: 'https://docs.fluxer.dev/runbooks/connection-pool'
- name: fluxer_cache_alerts
interval: 30s
rules:
- alert: FluxerCacheHitRateLow
expr: |
sum(rate(cache_operation{status="hit"}[5m])) by (cache_name)
/
sum(rate(cache_operation{status=~"hit|miss"}[5m])) by (cache_name) < 0.5
for: 15m
labels:
severity: warning
alert_type: cache_efficiency
annotations:
summary: 'Low cache hit rate for {{ $labels.cache_name }}'
description: 'Cache {{ $labels.cache_name }} hit rate below 50%. Current: {{ $value | humanizePercentage }}'
runbook: 'https://docs.fluxer.dev/runbooks/low-cache-hit-rate'
- name: fluxer_worker_alerts
interval: 30s
rules:
- alert: FluxerWorkerFailureRateCritical
expr: |
sum(rate(fluxer_worker_task_failure[5m])) by (task_name) > 1
for: 5m
labels:
severity: critical
alert_type: worker_failure
annotations:
summary: 'Critical worker task failure rate for {{ $labels.task_name }}'
description: 'Worker task {{ $labels.task_name }} failing at {{ $value | humanize }} tasks/sec. Check task logs.'
runbook: 'https://docs.fluxer.dev/runbooks/worker-failures'
- alert: FluxerCronJobOverdue
expr: |
time() - max by (cron) (fluxer_queue_cron_tick_timestamp) > 3600
for: 5m
labels:
severity: warning
service: fluxer-queue
alert_type: cron
annotations:
summary: 'Cron job {{ $labels.cron }} has not executed in over 1 hour'
description: "Scheduled cron job hasn't run since {{ $value | humanizeTimestamp }}. May indicate hung process."
runbook: 'https://docs.fluxer.dev/runbooks/cron-overdue'
- name: fluxer_telemetry_alerts
interval: 60s
rules:
- alert: FluxerTelemetryIngestionStopped
expr: |
increase(signoz_traces_signoz_index_v2[15m]) == 0
for: 5m
labels:
severity: critical
alert_type: telemetry
annotations:
summary: 'No traces being ingested'
description: "SigNoz collector hasn't received traces in 15 minutes. Check collector health and service instrumentation."
runbook: 'https://docs.fluxer.dev/runbooks/telemetry-down'
- name: fluxer_media_proxy_alerts
interval: 30s
rules:
- alert: FluxerMediaProxyErrorRate
expr: |
sum(rate(media_proxy_failure{service_name="fluxer-media-proxy"}[5m]))
/
sum(rate(http_server_request_count{service_name="fluxer-media-proxy"}[5m])) > 0.1
for: 10m
labels:
severity: warning
service: fluxer-media-proxy
alert_type: error_rate
annotations:
summary: 'Media proxy error rate above 10%'
description: 'Media proxy failing {{ $value | humanizePercentage }} of requests. Check origin servers and cache.'
runbook: 'https://docs.fluxer.dev/runbooks/media-proxy-errors'
- alert: FluxerMediaProxyTimeoutRate
expr: |
sum(rate(media_proxy_failure{error_type="timeout"}[5m])) > 5
for: 5m
labels:
severity: warning
service: fluxer-media-proxy
alert_type: timeout
annotations:
summary: 'Media proxy timeout rate above 5 req/s'
description: 'Media proxy experiencing high timeout rate. May indicate network issues or slow origins.'
runbook: 'https://docs.fluxer.dev/runbooks/media-proxy-timeouts'

View File

@@ -0,0 +1,213 @@
x-common: &common
networks:
- fluxer-shared
logging:
options:
max-size: 50m
max-file: '3'
x-deploy-base: &deploy_base
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
x-clickhouse-defaults: &clickhouse_defaults
<<: *common
image: clickhouse/clickhouse-server:25.5.6
tty: true
environment:
- CLICKHOUSE_SKIP_USER_SETUP=1
deploy:
<<: *deploy_base
labels:
signoz.io/scrape: 'true'
signoz.io/port: '9363'
signoz.io/path: '/metrics'
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- 0.0.0.0:8123/ping
interval: 30s
timeout: 5s
retries: 3
ulimits:
nproc: 65535
nofile:
soft: 262144
hard: 262144
x-zookeeper-defaults: &zookeeper_defaults
<<: *common
image: signoz/zookeeper:3.7.1
user: root
deploy:
<<: *deploy_base
labels:
signoz.io/scrape: 'true'
signoz.io/port: '9141'
signoz.io/path: '/metrics'
healthcheck:
test:
- CMD-SHELL
- curl -s -m 2 http://localhost:8080/commands/ruok | grep error | grep null
interval: 30s
timeout: 5s
retries: 3
services:
init-clickhouse:
<<: *common
image: clickhouse/clickhouse-server:25.5.6
command:
- bash
- -c
- |
version="v0.0.1"
node_os=$$(uname -s | tr '[:upper:]' '[:lower:]')
node_arch=$$(uname -m | sed s/aarch64/arm64/ | sed s/x86_64/amd64/)
echo "Fetching histogram-binary for $${node_os}/$${node_arch}"
cd /tmp
wget -O histogram-quantile.tar.gz "https://github.com/SigNoz/signoz/releases/download/histogram-quantile%2F$${version}/histogram-quantile_$${node_os}_$${node_arch}.tar.gz"
tar -xvzf histogram-quantile.tar.gz
mv histogram-quantile /var/lib/clickhouse/user_scripts/histogramQuantile
deploy:
restart_policy:
condition: on-failure
volumes:
- ./conf/clickhouse/user_scripts:/var/lib/clickhouse/user_scripts:rw
zookeeper-1:
<<: *zookeeper_defaults
environment:
- ZOO_SERVER_ID=1
- ALLOW_ANONYMOUS_LOGIN=yes
- ZOO_AUTOPURGE_INTERVAL=1
- ZOO_ENABLE_PROMETHEUS_METRICS=yes
- ZOO_PROMETHEUS_METRICS_PORT_NUMBER=9141
volumes:
- zookeeper-1:/bitnami/zookeeper
clickhouse:
<<: *clickhouse_defaults
hostname: clickhouse
configs:
- source: clickhouse-config
target: /etc/clickhouse-server/config.xml
- source: clickhouse-users
target: /etc/clickhouse-server/users.xml
- source: clickhouse-custom-function
target: /etc/clickhouse-server/custom-function.xml
- source: clickhouse-cluster
target: /etc/clickhouse-server/config.d/cluster.xml
volumes:
- clickhouse:/var/lib/clickhouse/
schema-migrator:
<<: *common
image: signoz/signoz-schema-migrator:${OTELCOL_TAG:-v0.129.12}
entrypoint: sh
command:
- -c
- /signoz-schema-migrator sync --dsn=tcp://clickhouse:9000 --up= && /signoz-schema-migrator async --dsn=tcp://clickhouse:9000 --up=
deploy:
restart_policy:
condition: on-failure
delay: 5s
signoz:
<<: *common
image: signoz/signoz:${SIGNOZ_IMAGE_TAG:-v0.108.0}
command:
- --config=/root/config/prometheus.yml
environment:
- SIGNOZ_ALERTMANAGER_PROVIDER=signoz
- SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_DSN=tcp://clickhouse:9000
- SIGNOZ_SQLSTORE_SQLITE_PATH=/var/lib/signoz/signoz.db
- DASHBOARDS_PATH=/root/config/dashboards
- STORAGE=clickhouse
- GODEBUG=netdns=go
- TELEMETRY_ENABLED=true
- DEPLOYMENT_TYPE=docker-swarm
- DOT_METRICS_ENABLED=true
configs:
- source: signoz-prometheus-config
target: /root/config/prometheus.yml
volumes:
- sqlite:/var/lib/signoz/
- ./dashboards:/root/config/dashboards:ro
deploy:
<<: *deploy_base
replicas: 1
labels:
- 'caddy=signoz.fluxer.app'
- 'caddy.reverse_proxy={{upstreams 8080}}'
- 'caddy.header.Strict-Transport-Security="max-age=31536000; includeSubDomains; preload"'
- 'caddy.header.X-Xss-Protection="1; mode=block"'
- 'caddy.header.X-Content-Type-Options=nosniff'
- 'caddy.header.Referrer-Policy=strict-origin-when-cross-origin'
- 'caddy.header.X-Frame-Options=DENY'
healthcheck:
test:
- CMD
- wget
- --spider
- -q
- localhost:8080/api/v1/health
interval: 30s
timeout: 5s
retries: 3
otel-collector:
<<: *common
image: signoz/signoz-otel-collector:${OTELCOL_TAG:-v0.129.12}
command:
- --config=/etc/otel-collector-config.yaml
- --manager-config=/etc/manager-config.yaml
- --copy-path=/var/tmp/collector-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
configs:
- source: otel-collector-config
target: /etc/otel-collector-config.yaml
- source: otel-manager-config
target: /etc/manager-config.yaml
environment:
- OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}}
- LOW_CARDINAL_EXCEPTION_GROUPING=false
ports:
- '4317:4317'
- '4318:4318'
deploy:
<<: *deploy_base
replicas: 3
networks:
fluxer-shared:
external: true
volumes:
clickhouse:
driver: local
sqlite:
driver: local
zookeeper-1:
driver: local
configs:
clickhouse-config:
file: ./conf/clickhouse/config.xml
clickhouse-users:
file: ./conf/clickhouse/users.xml
clickhouse-custom-function:
file: ./conf/clickhouse/custom-function.xml
clickhouse-cluster:
file: ./conf/clickhouse/cluster.xml
signoz-prometheus-config:
file: ./conf/signoz/prometheus.yml
otel-collector-config:
file: ./conf/signoz/otel-collector-config.yaml
otel-manager-config:
file: ./conf/signoz/otel-collector-opamp-config.yaml

View File

@@ -0,0 +1,75 @@
<?xml version="1.0"?>
<clickhouse>
<!-- ZooKeeper is used to store metadata about replicas, when using Replicated tables.
Optional. If you don't use replicated tables, you could omit that.
See https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication/
-->
<zookeeper>
<node index="1">
<host>zookeeper-1</host>
<port>2181</port>
</node>
<!-- <node index="2">
<host>zookeeper-2</host>
<port>2181</port>
</node>
<node index="3">
<host>zookeeper-3</host>
<port>2181</port>
</node> -->
</zookeeper>
<!-- Configuration of clusters that could be used in Distributed tables.
https://clickhouse.com/docs/en/operations/table_engines/distributed/
-->
<remote_servers>
<cluster>
<!-- Inter-server per-cluster secret for Distributed queries
default: no secret (no authentication will be performed)
If set, then Distributed queries will be validated on shards, so at least:
- such cluster should exist on the shard,
- such cluster should have the same secret.
And also (and which is more important), the initial_user will
be used as current user for the query.
Right now the protocol is pretty simple and it only takes into account:
- cluster name
- query
Also it will be nice if the following will be implemented:
- source hostname (see interserver_http_host), but then it will depends from DNS,
it can use IP address instead, but then the you need to get correct on the initiator node.
- target hostname / ip address (same notes as for source hostname)
- time-based security tokens
-->
<!-- <secret></secret> -->
<shard>
<!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
<!-- <internal_replication>false</internal_replication> -->
<!-- Optional. Shard weight when writing data. Default: 1. -->
<!-- <weight>1</weight> -->
<replica>
<host>clickhouse</host>
<port>9000</port>
<!-- Optional. Priority of the replica for load_balancing. Default: 1 (less value has more priority). -->
<!-- <priority>1</priority> -->
</replica>
</shard>
<!-- <shard>
<replica>
<host>clickhouse-2</host>
<port>9000</port>
</replica>
</shard>
<shard>
<replica>
<host>clickhouse-3</host>
<port>9000</port>
</replica>
</shard> -->
</cluster>
</remote_servers>
</clickhouse>

View File

@@ -0,0 +1,8 @@
<clickhouse>
<keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<snapshot_storage_path>/var/lib/clickhouse/coordination/snapshots</snapshot_storage_path>
</keeper_server>
</clickhouse>

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,21 @@
<functions>
<function>
<type>executable</type>
<name>histogramQuantile</name>
<return_type>Float64</return_type>
<argument>
<type>Array(Float64)</type>
<name>buckets</name>
</argument>
<argument>
<type>Array(Float64)</type>
<name>counts</name>
</argument>
<argument>
<type>Float64</type>
<name>quantile</name>
</argument>
<format>CSV</format>
<command>./histogramQuantile</command>
</function>
</functions>

View File

@@ -0,0 +1,123 @@
<?xml version="1.0"?>
<clickhouse>
<!-- See also the files in users.d directory where the settings can be overridden. -->
<!-- Profiles of settings. -->
<profiles>
<!-- Default settings. -->
<default>
<!-- Maximum memory usage for processing single query, in bytes. -->
<max_memory_usage>10000000000</max_memory_usage>
<!-- How to choose between replicas during distributed query processing.
random - choose random replica from set of replicas with minimum number of errors
nearest_hostname - from set of replicas with minimum number of errors, choose replica
with minimum number of different symbols between replica's hostname and local hostname
(Hamming distance).
in_order - first live replica is chosen in specified order.
first_or_random - if first replica one has higher number of errors, pick a random one from replicas with minimum number of errors.
-->
<load_balancing>random</load_balancing>
</default>
<!-- Profile that allows only read queries. -->
<readonly>
<readonly>1</readonly>
</readonly>
</profiles>
<!-- Users and ACL. -->
<users>
<!-- If user name was not specified, 'default' user is used. -->
<default>
<!-- See also the files in users.d directory where the password can be overridden.
Password could be specified in plaintext or in SHA256 (in hex format).
If you want to specify password in plaintext (not recommended), place it in 'password' element.
Example: <password>qwerty</password>.
Password could be empty.
If you want to specify SHA256, place it in 'password_sha256_hex' element.
Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex>
Restrictions of SHA256: impossibility to connect to ClickHouse using MySQL JS client (as of July 2019).
If you want to specify double SHA1, place it in 'password_double_sha1_hex' element.
Example: <password_double_sha1_hex>e395796d6546b1b65db9d665cd43f0e858dd4303</password_double_sha1_hex>
If you want to specify a previously defined LDAP server (see 'ldap_servers' in the main config) for authentication,
place its name in 'server' element inside 'ldap' element.
Example: <ldap><server>my_ldap_server</server></ldap>
If you want to authenticate the user via Kerberos (assuming Kerberos is enabled, see 'kerberos' in the main config),
place 'kerberos' element instead of 'password' (and similar) elements.
The name part of the canonical principal name of the initiator must match the user name for authentication to succeed.
You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests
whose initiator's realm matches it.
Example: <kerberos />
Example: <kerberos><realm>EXAMPLE.COM</realm></kerberos>
How to generate decent password:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
In first line will be password and in second - corresponding SHA256.
How to generate double SHA1:
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
In first line will be password and in second - corresponding double SHA1.
-->
<password></password>
<!-- List of networks with open access.
To open access from everywhere, specify:
<ip>::/0</ip>
To open access only from localhost, specify:
<ip>::1</ip>
<ip>127.0.0.1</ip>
Each element of list has one of the following forms:
<ip> IP-address or network mask. Examples: 213.180.204.3 or 10.0.0.1/8 or 10.0.0.1/255.255.255.0
2a02:6b8::3 or 2a02:6b8::3/64 or 2a02:6b8::3/ffff:ffff:ffff:ffff::.
<host> Hostname. Example: server01.clickhouse.com.
To check access, DNS query is performed, and all received addresses compared to peer address.
<host_regexp> Regular expression for host names. Example, ^server\d\d-\d\d-\d\.clickhouse\.com$
To check access, DNS PTR query is performed for peer address and then regexp is applied.
Then, for result of PTR query, another DNS query is performed and all received addresses compared to peer address.
Strongly recommended that regexp is ends with $
All results of DNS requests are cached till server restart.
-->
<networks>
<ip>::/0</ip>
</networks>
<!-- Settings profile for user. -->
<profile>default</profile>
<!-- Quota for user. -->
<quota>default</quota>
<!-- User can create other users and grant rights to them. -->
<!-- <access_management>1</access_management> -->
</default>
</users>
<!-- Quotas. -->
<quotas>
<!-- Name of quota. -->
<default>
<!-- Limits for time interval. You could specify many intervals with different limits. -->
<interval>
<!-- Length of interval. -->
<duration>3600</duration>
<!-- No limits. Just calculate resource usage for time interval. -->
<queries>0</queries>
<errors>0</errors>
<result_rows>0</result_rows>
<read_rows>0</read_rows>
<execution_time>0</execution_time>
</interval>
</default>
</quotas>
</clickhouse>

View File

@@ -0,0 +1,109 @@
# OpenTelemetry Collector configuration for the SigNoz stack.
# Data flow: OTLP / Prometheus receivers -> processors -> ClickHouse exporters,
# with the "signozmeter" connector bridging the telemetry pipelines into a
# separate usage-metering pipeline (metrics/meter, defined at the bottom).
connectors:
  # Connector: consumed as an exporter by the traces/metrics/logs pipelines
  # below and re-emitted as a receiver into the metrics/meter pipeline.
  signozmeter:
    metrics_flush_interval: 1h
    dimensions:
      - name: service.name
      - name: deployment.environment
      - name: host.name
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
  # Scrapes the collector's own internal telemetry endpoint (localhost:8888).
  prometheus:
    config:
      global:
        scrape_interval: 60s
      scrape_configs:
        - job_name: otel-collector
          static_configs:
            - targets:
                - localhost:8888
              labels:
                job_name: otel-collector
processors:
  batch:
    send_batch_size: 10000
    send_batch_max_size: 11000
    timeout: 10s
  # Larger batches with a much shorter flush timeout, used only by the
  # metering pipeline (metrics/meter).
  batch/meter:
    send_batch_max_size: 25000
    send_batch_size: 20000
    timeout: 1s
  # NOTE(review): resourcedetection is defined here but not referenced by any
  # pipeline in the service section — confirm whether it should be added to
  # the pipelines or removed.
  resourcedetection:
    detectors: [env, system]
    timeout: 2s
  # Derives span metrics from traces (delta temporality, exponential
  # histograms enabled) and writes them via signozclickhousemetrics.
  signozspanmetrics/delta:
    metrics_exporter: signozclickhousemetrics
    metrics_flush_interval: 60s
    latency_histogram_buckets:
      [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s]
    dimensions_cache_size: 100000
    aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
    enable_exp_histogram: true
    dimensions:
      - name: service.namespace
        default: default
      - name: deployment.environment
        default: default
      - name: signoz.collector.id
      - name: service.version
      - name: browser.platform
      - name: browser.mobile
      - name: k8s.cluster.name
      - name: k8s.node.name
      - name: k8s.namespace.name
      - name: host.name
      - name: host.type
      - name: container.name
extensions:
  health_check:
    endpoint: 0.0.0.0:13133
  pprof:
    endpoint: 0.0.0.0:1777
exporters:
  # All exporters write to per-signal ClickHouse databases on the same host.
  clickhousetraces:
    datasource: tcp://clickhouse:9000/signoz_traces
    low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING}
    use_new_schema: true
  signozclickhousemetrics:
    dsn: tcp://clickhouse:9000/signoz_metrics
  clickhouselogsexporter:
    dsn: tcp://clickhouse:9000/signoz_logs
    timeout: 10s
    use_new_schema: true
  # Metering sink; queueing disabled so failures surface immediately.
  signozclickhousemeter:
    dsn: tcp://clickhouse:9000/signoz_meter
    timeout: 45s
    sending_queue:
      enabled: false
service:
  telemetry:
    logs:
      encoding: json
  extensions:
    - health_check
    - pprof
  pipelines:
    traces:
      receivers: [otlp]
      processors: [signozspanmetrics/delta, batch]
      exporters: [clickhousetraces, signozmeter]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [signozclickhousemetrics, signozmeter]
    metrics/prometheus:
      receivers: [prometheus]
      processors: [batch]
      exporters: [signozclickhousemetrics, signozmeter]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [clickhouselogsexporter, signozmeter]
    # Metering pipeline: fed exclusively by the signozmeter connector.
    metrics/meter:
      receivers: [signozmeter]
      processors: [batch/meter]
      exporters: [signozclickhousemeter]

View File

@@ -0,0 +1 @@
# OpAMP endpoint the collector connects to for remote management —
# presumably the SigNoz server's OpAMP websocket port; verify against deployment.
server_endpoint: ws://signoz:4320/v1/opamp

View File

@@ -0,0 +1,16 @@
# Prometheus-style configuration for the SigNoz query service.
global:
  scrape_interval: 5s
  evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
# No local rules or scrape jobs: metrics are read back from ClickHouse.
rule_files: []
scrape_configs: []
remote_read:
  # NOTE(review): a tcp:// ClickHouse DSN rather than an HTTP remote-read URL —
  # presumably a SigNoz-specific extension; confirm against SigNoz docs.
  - url: tcp://clickhouse:9000/signoz_metrics

View File

38
fluxer_devops/signoz/deploy.sh Executable file
View File

@@ -0,0 +1,38 @@
#!/usr/bin/env sh
# Copyright (C) 2026 Fluxer Contributors
#
# This file is part of Fluxer.
#
# Fluxer is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Fluxer is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with Fluxer. If not, see <https://www.gnu.org/licenses/>.

# Deploy the SigNoz observability stack as a Docker Swarm stack.
#
# Environment overrides:
#   STACK            - stack name (default: fluxer-signoz)
#   SIGNOZ_IMAGE_TAG - SigNoz image tag   (default: v0.105.1)
set -eu

SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
STACK=${STACK:-fluxer-signoz}
SIGNOZ_IMAGE_TAG=${SIGNOZ_IMAGE_TAG:-v0.105.1}

# Check swarm state BEFORE touching the overlay network: 'docker network
# create -d overlay' fails on a non-swarm daemon, so with the original
# ordering (network first, check second) the helpful error below was
# unreachable on a fresh host.
if [ "$(docker info --format '{{.Swarm.LocalNodeState}}')" != "active" ]; then
    echo "Docker swarm must be active for stack deployment. Run 'docker swarm init' and try again."
    exit 1
fi

# Shared overlay network so other Fluxer stacks can reach the SigNoz services.
if ! docker network inspect fluxer-shared >/dev/null 2>&1; then
    docker network create -d overlay fluxer-shared
fi

# Exported so compose.yaml can interpolate them during stack deploy.
export STACK
export SIGNOZ_IMAGE_TAG
docker stack deploy --with-registry-auth -c "$SCRIPT_DIR/compose.yaml" "$STACK"