refactor progress
This commit is contained in:
407
fluxer_devops/signoz/alerts/critical-alerts.json
Normal file
407
fluxer_devops/signoz/alerts/critical-alerts.json
Normal file
@@ -0,0 +1,407 @@
|
||||
{
|
||||
"name": "Fluxer Critical Alerts",
|
||||
"description": "Critical alerts for Fluxer services",
|
||||
"version": 2,
|
||||
"alerts": [
|
||||
{
|
||||
"id": "high-api-error-rate",
|
||||
"name": "High API Error Rate",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "sum(rate(http_server_request_count{service_name='fluxer-api',http_response_status_code=~'5..'}[5m])) > 10",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "5m"
|
||||
},
|
||||
"severity": "critical",
|
||||
"annotations": {
|
||||
"summary": "API error rate is above 10 req/s",
|
||||
"description": "The fluxer-api service is experiencing a high error rate (5xx responses). This may indicate a service degradation or outage."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-api",
|
||||
"alert_type": "error_rate"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-critical"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "high-api-latency",
|
||||
"name": "High API Latency",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "histogram_quantile(0.95, sum(rate(http_server_request_duration_bucket{service_name='fluxer-api'}[5m])) by (le)) > 1000",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "10m"
|
||||
},
|
||||
"severity": "warning",
|
||||
"annotations": {
|
||||
"summary": "API P95 latency is above 1000ms",
|
||||
"description": "The fluxer-api service is experiencing high latency. 95% of requests are taking longer than 1 second."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-api",
|
||||
"alert_type": "latency"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-warning"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "queue-depth-high",
|
||||
"name": "Queue Depth Too High",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "fluxer_queue_depth > 10000",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "15m"
|
||||
},
|
||||
"severity": "warning",
|
||||
"annotations": {
|
||||
"summary": "Queue depth is above 10,000 jobs",
|
||||
"description": "The job queue has accumulated more than 10,000 jobs. This may indicate processing is slower than job arrival."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-queue",
|
||||
"alert_type": "queue_depth"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-warning"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "queue-dlq-rate",
|
||||
"name": "High Dead Letter Queue Rate",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "sum(rate(fluxer_queue_dead_letter[5m])) > 5",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "10m"
|
||||
},
|
||||
"severity": "critical",
|
||||
"annotations": {
|
||||
"summary": "DLQ rate is above 5 jobs/sec",
|
||||
"description": "Jobs are being moved to the dead letter queue at a high rate. This may indicate persistent job failures."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-queue",
|
||||
"alert_type": "dlq_rate"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-critical"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "gateway-connection-drop",
|
||||
"name": "Gateway Connection Drop Rate",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "rate(gateway_websocket_disconnections[1m]) / rate(gateway_websocket_connections[1m]) > 0.5",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "5m"
|
||||
},
|
||||
"severity": "critical",
|
||||
"annotations": {
|
||||
"summary": "Gateway disconnect rate exceeds 50% of connect rate",
|
||||
"description": "WebSocket connections are dropping at an unusually high rate. This may indicate network issues or service instability."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-gateway",
|
||||
"alert_type": "connection_stability"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-critical"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "gateway-rpc-latency-high",
|
||||
"name": "Gateway RPC Latency High",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "gateway_rpc_latency_p95 > 500",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "10m"
|
||||
},
|
||||
"severity": "warning",
|
||||
"annotations": {
|
||||
"summary": "Gateway RPC P95 latency above 500ms",
|
||||
"description": "RPC calls from gateway to backend are experiencing high latency."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-gateway",
|
||||
"alert_type": "latency"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-warning"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "media-proxy-error-rate",
|
||||
"name": "Media Proxy High Error Rate",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "sum(rate(media_proxy_failure{service_name='fluxer-media-proxy'}[5m])) / sum(rate(http_server_request_count{service_name='fluxer-media-proxy'}[5m])) > 0.1",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "10m"
|
||||
},
|
||||
"severity": "warning",
|
||||
"annotations": {
|
||||
"summary": "Media proxy error rate above 10%",
|
||||
"description": "The media proxy is failing more than 10% of requests. This may indicate origin issues or cache problems."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-media-proxy",
|
||||
"alert_type": "error_rate"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-warning"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "media-proxy-timeout-rate",
|
||||
"name": "Media Proxy High Timeout Rate",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "sum(rate(media_proxy_failure{error_type='timeout'}[5m])) > 5",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "5m"
|
||||
},
|
||||
"severity": "warning",
|
||||
"annotations": {
|
||||
"summary": "Media proxy timeout rate above 5 req/s",
|
||||
"description": "The media proxy is experiencing a high rate of timeouts. This may indicate network issues or slow origin servers."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-media-proxy",
|
||||
"alert_type": "timeout"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-warning"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "telemetry-ingestion-stopped",
|
||||
"name": "Telemetry Ingestion Stopped",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "increase(signoz_traces_signoz_index_v2[15m]) == 0",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "5m"
|
||||
},
|
||||
"severity": "critical",
|
||||
"annotations": {
|
||||
"summary": "No traces being ingested",
|
||||
"description": "The SigNoz collector has not received any traces in the last 15 minutes. This may indicate a collector issue or service instrumentation failure."
|
||||
},
|
||||
"labels": {
|
||||
"service": "signoz",
|
||||
"alert_type": "telemetry"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-critical"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "cron-job-overdue",
|
||||
"name": "Cron Job Overdue",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "time() - max(fluxer_queue_cron_tick_timestamp) by (cron) > 3600",
|
||||
"evaluation_interval": "5m",
|
||||
"for": "5m"
|
||||
},
|
||||
"severity": "warning",
|
||||
"annotations": {
|
||||
"summary": "Cron job has not executed in over 1 hour",
|
||||
"description": "A scheduled cron job has not run in over an hour. This may indicate a hung cron process or scheduling issue."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-queue",
|
||||
"alert_type": "cron"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-warning"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "csam-match-detected",
|
||||
"name": "CSAM Match Detected",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "sum(rate(fluxer_csam_matches_total{service_name='fluxer-api'}[1m])) > 0",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "0m"
|
||||
},
|
||||
"severity": "critical",
|
||||
"annotations": {
|
||||
"summary": "CSAM content has been detected",
|
||||
"description": "CSAM content has been detected. Immediate review required."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-api",
|
||||
"alert_type": "csam_match"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-critical"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "photodna-api-error-rate-high",
|
||||
"name": "PhotoDNA API Error Rate High",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "sum(rate(fluxer_csam_photodna_api_total{service_name='fluxer-api',status='error'}[5m])) / sum(rate(fluxer_csam_photodna_api_total{service_name='fluxer-api'}[5m])) > 0.1",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "5m"
|
||||
},
|
||||
"severity": "warning",
|
||||
"annotations": {
|
||||
"summary": "PhotoDNA API error rate exceeds 10%",
|
||||
"description": "PhotoDNA API error rate exceeds 10%"
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-api",
|
||||
"alert_type": "photodna_error_rate"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-warning"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "ncmec-submission-failure",
|
||||
"name": "NCMEC Submission Failure",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "sum(rate(fluxer_csam_ncmec_submissions{service_name='fluxer-api',status='error'}[5m])) > 0",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "5m"
|
||||
},
|
||||
"severity": "critical",
|
||||
"annotations": {
|
||||
"summary": "NCMEC report submission has failed",
|
||||
"description": "NCMEC report submission has failed. Manual intervention required."
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-api",
|
||||
"alert_type": "ncmec_submission"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-critical"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "csam-scan-failure-rate-high",
|
||||
"name": "CSAM Scan Failure Rate High",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "sum(rate(fluxer_csam_scans_total{service_name='fluxer-api',status='error'}[5m])) / sum(rate(fluxer_csam_scans_total{service_name='fluxer-api'}[5m])) > 0.05",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "5m"
|
||||
},
|
||||
"severity": "warning",
|
||||
"annotations": {
|
||||
"summary": "CSAM scan failure rate exceeds 5%",
|
||||
"description": "CSAM scan failure rate exceeds 5%"
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-api",
|
||||
"alert_type": "csam_scan_failure_rate"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-warning"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "photodna-api-latency-high",
|
||||
"name": "PhotoDNA API Latency High",
|
||||
"type": "metric",
|
||||
"condition": {
|
||||
"query": "histogram_quantile(0.95, sum(rate(fluxer_csam_photodna_api_duration_ms_bucket{service_name='fluxer-api'}[5m])) by (le)) > 5000",
|
||||
"evaluation_interval": "1m",
|
||||
"for": "5m"
|
||||
},
|
||||
"severity": "warning",
|
||||
"annotations": {
|
||||
"summary": "PhotoDNA API p95 latency exceeds 5 seconds",
|
||||
"description": "PhotoDNA API p95 latency exceeds 5 seconds"
|
||||
},
|
||||
"labels": {
|
||||
"service": "fluxer-api",
|
||||
"alert_type": "photodna_latency"
|
||||
},
|
||||
"actions": [
|
||||
{
|
||||
"type": "notification",
|
||||
"channel": "slack",
|
||||
"target": "#alerts-warning"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"notification_channels": {
|
||||
"slack": {
|
||||
"type": "webhook",
|
||||
"url": "${ALERT_WEBHOOK_URL}",
|
||||
"channel_mapping": {
|
||||
"critical": "#alerts-critical",
|
||||
"warning": "#alerts-warning"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
329
fluxer_devops/signoz/alerts/default-alerts.yaml
Normal file
329
fluxer_devops/signoz/alerts/default-alerts.yaml
Normal file
@@ -0,0 +1,329 @@
|
||||
groups:
|
||||
- name: fluxer_api_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: FluxerHighErrorRate
|
||||
expr: |
|
||||
(
|
||||
sum by (service_name) (rate(http_server_request_count{http_response_status_code=~"5.."}[5m]))
|
||||
/
|
||||
sum by (service_name) (rate(http_server_request_count[5m]))
|
||||
) > 0.05
|
||||
and sum by (service_name) (rate(http_server_request_count[5m])) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
service: fluxer-api
|
||||
alert_type: error_rate
|
||||
annotations:
|
||||
summary: 'High error rate on {{ $labels.service_name }}'
|
||||
description: 'Error rate is above 5% (minimum 10 req/s over 5m) on {{ $labels.service_name }}. Current value: {{ $value | humanizePercentage }}'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/high-error-rate'
|
||||
|
||||
- alert: FluxerElevatedErrorRate
|
||||
expr: |
|
||||
(
|
||||
sum by (service_name) (rate(http_server_request_count{http_response_status_code=~"5.."}[5m]))
|
||||
/
|
||||
sum by (service_name) (rate(http_server_request_count[5m]))
|
||||
) > 0.01
|
||||
and sum by (service_name) (rate(http_server_request_count[5m])) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxer-api
|
||||
alert_type: error_rate
|
||||
annotations:
|
||||
summary: 'Elevated error rate on {{ $labels.service_name }}'
|
||||
description: 'Error rate is above 1% on {{ $labels.service_name }}. Current value: {{ $value | humanizePercentage }}'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/high-error-rate'
|
||||
|
||||
- name: fluxer_queue_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: FluxerQueueDepthCritical
|
||||
expr: |
|
||||
fluxer_queue_depth{service_name="fluxer-queue"} > 10000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
service: fluxer-queue
|
||||
alert_type: queue_depth
|
||||
annotations:
|
||||
summary: 'Queue depth critically high for {{ $labels.queue_name }}'
|
||||
description: 'Queue {{ $labels.queue_name }} has {{ $value }} jobs pending (threshold: 10,000). Jobs may be delayed or processing is stalled.'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/queue-depth-critical'
|
||||
|
||||
- alert: FluxerQueueDepthElevated
|
||||
expr: |
|
||||
fluxer_queue_depth{service_name="fluxer-queue"} > 5000
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxer-queue
|
||||
alert_type: queue_depth
|
||||
annotations:
|
||||
summary: 'Queue depth elevated for {{ $labels.queue_name }}'
|
||||
description: 'Queue {{ $labels.queue_name }} has {{ $value }} jobs pending (threshold: 5,000). Monitor for escalation.'
|
||||
|
||||
- alert: FluxerDLQRateCritical
|
||||
expr: |
|
||||
sum(rate(fluxer_queue_dead_letter{service_name="fluxer-queue"}[5m])) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
service: fluxer-queue
|
||||
alert_type: dlq_rate
|
||||
annotations:
|
||||
summary: 'High dead letter queue rate'
|
||||
description: 'Jobs are failing and moving to DLQ at rate {{ $value | humanize }} jobs/sec. Check job failures and error logs.'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/high-dlq-rate'
|
||||
|
||||
- name: fluxer_gateway_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: FluxerGatewayConnectionDropCritical
|
||||
expr: |
|
||||
sum(rate(gateway_websocket_disconnections{reason="error"}[1m])) by (service_name) > 10
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
service: fluxer-gateway
|
||||
alert_type: connection_drop
|
||||
annotations:
|
||||
summary: 'Critical WebSocket error disconnect rate'
|
||||
description: 'Gateway experiencing {{ $value | humanize }} error disconnects/sec. This may indicate service instability or network issues.'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/gateway-connection-drop'
|
||||
|
||||
- alert: FluxerGatewayDisconnectElevated
|
||||
expr: |
|
||||
sum(rate(gateway_websocket_disconnections{reason="error"}[1m])) by (service_name) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxer-gateway
|
||||
alert_type: connection_drop
|
||||
annotations:
|
||||
summary: 'Elevated WebSocket error disconnect rate'
|
||||
description: 'Gateway experiencing {{ $value | humanize }} error disconnects/sec. Monitor for escalation.'
|
||||
|
||||
- alert: FluxerGatewayDisconnectRatioHigh
|
||||
expr: |
|
||||
(
|
||||
sum(rate(gateway_websocket_disconnections{reason="error"}[5m])) by (service_name)
|
||||
/
|
||||
sum(rate(gateway_websocket_connections[5m])) by (service_name)
|
||||
) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
service: fluxer-gateway
|
||||
alert_type: disconnect_ratio
|
||||
annotations:
|
||||
summary: 'Gateway disconnect ratio above 10%'
|
||||
description: 'Error disconnects represent {{ $value | humanizePercentage }} of new connections. Check gateway stability.'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/gateway-connection-drop'
|
||||
|
||||
- alert: FluxerGatewayRPCLatencyHigh
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
sum(rate(gateway_rpc_latency_bucket{service_name="fluxer-gateway"}[5m])) by (le)
|
||||
) > 500
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxer-gateway
|
||||
alert_type: rpc_latency
|
||||
annotations:
|
||||
summary: 'Gateway RPC P95 latency above 500ms'
|
||||
description: 'Gateway RPC calls experiencing high latency. Current P95: {{ $value | humanize }}ms'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/gateway-rpc-latency'
|
||||
|
||||
- name: fluxer_log_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: FluxerLogErrorSpikeCritical
|
||||
expr: |
|
||||
sum(rate(logs_count{severity_text="ERROR"}[5m])) by (service_name) > 50
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_type: log_error_spike
|
||||
annotations:
|
||||
summary: 'Critical error log volume spike on {{ $labels.service_name }}'
|
||||
description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} errors/sec. Check logs and traces for root cause.'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/log-error-spike'
|
||||
|
||||
- alert: FluxerLogErrorElevated
|
||||
expr: |
|
||||
sum(rate(logs_count{severity_text="ERROR"}[5m])) by (service_name) > 20
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_type: log_error_elevated
|
||||
annotations:
|
||||
summary: 'Elevated error log volume on {{ $labels.service_name }}'
|
||||
description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} errors/sec. Monitor for escalation.'
|
||||
|
||||
- alert: FluxerLogWarningElevated
|
||||
expr: |
|
||||
sum(rate(logs_count{severity_text="WARN"}[5m])) by (service_name) > 100
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_type: log_warning_elevated
|
||||
annotations:
|
||||
summary: 'Elevated warning log volume on {{ $labels.service_name }}'
|
||||
description: 'Service {{ $labels.service_name }} logging {{ $value | humanize }} warnings/sec. Review warning patterns.'
|
||||
|
||||
- name: fluxer_api_performance_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: FluxerAPILatencyCritical
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
sum(rate(http_server_request_duration_bucket{service_name="fluxer-api"}[5m])) by (le, http_route)
|
||||
) > 2000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
service: fluxer-api
|
||||
alert_type: latency
|
||||
annotations:
|
||||
summary: 'Critical API latency on route {{ $labels.http_route }}'
|
||||
description: 'P95 latency for route {{ $labels.http_route }} is above 2 seconds. Current: {{ $value | humanize }}ms'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/high-api-latency'
|
||||
|
||||
- alert: FluxerAPILatencyElevated
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
sum(rate(http_server_request_duration_bucket{service_name="fluxer-api"}[5m])) by (le, http_route)
|
||||
) > 1000
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxer-api
|
||||
alert_type: latency
|
||||
annotations:
|
||||
summary: 'Elevated API latency on route {{ $labels.http_route }}'
|
||||
description: 'P95 latency for route {{ $labels.http_route }} is above 1 second. Current: {{ $value | humanize }}ms'
|
||||
|
||||
- name: fluxer_database_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: FluxerDBLatencyCritical
|
||||
expr: |
|
||||
histogram_quantile(0.95,
|
||||
sum(rate(db_query_latency_bucket[5m])) by (le, query_type)
|
||||
) > 1000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_type: database_latency
|
||||
annotations:
|
||||
summary: 'Critical database query latency for {{ $labels.query_type }}'
|
||||
description: 'P95 {{ $labels.query_type }} query latency above 1 second. Current: {{ $value | humanize }}ms'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/database-latency'
|
||||
|
||||
- alert: FluxerDBConnectionPoolHigh
|
||||
expr: |
|
||||
db_connection_pool_active / db_connection_pool_max > 0.8
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_type: connection_pool
|
||||
annotations:
|
||||
summary: 'Database connection pool usage above 80%'
|
||||
description: 'Connection pool at {{ $value | humanizePercentage }} capacity. May lead to connection waits.'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/connection-pool'
|
||||
|
||||
- name: fluxer_cache_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: FluxerCacheHitRateLow
|
||||
expr: |
|
||||
sum(rate(cache_operation{status="hit"}[5m])) by (cache_name)
|
||||
/
|
||||
sum(rate(cache_operation{status=~"hit|miss"}[5m])) by (cache_name) < 0.5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
alert_type: cache_efficiency
|
||||
annotations:
|
||||
summary: 'Low cache hit rate for {{ $labels.cache_name }}'
|
||||
description: 'Cache {{ $labels.cache_name }} hit rate below 50%. Current: {{ $value | humanizePercentage }}'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/low-cache-hit-rate'
|
||||
|
||||
- name: fluxer_worker_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: FluxerWorkerFailureRateCritical
|
||||
expr: |
|
||||
sum(rate(fluxer_worker_task_failure[5m])) by (task_name) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_type: worker_failure
|
||||
annotations:
|
||||
summary: 'Critical worker task failure rate for {{ $labels.task_name }}'
|
||||
description: 'Worker task {{ $labels.task_name }} failing at {{ $value | humanize }} tasks/sec. Check task logs.'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/worker-failures'
|
||||
|
||||
- alert: FluxerCronJobOverdue
|
||||
expr: |
|
||||
time() - max(fluxer_queue_cron_tick_timestamp) by (cron) > 3600
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxer-queue
|
||||
alert_type: cron
|
||||
annotations:
|
||||
summary: 'Cron job {{ $labels.cron }} has not executed in over 1 hour'
|
||||
description: "Scheduled cron job hasn't run for {{ $value | humanizeDuration }}. May indicate hung process."
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/cron-overdue'
|
||||
|
||||
- name: fluxer_telemetry_alerts
|
||||
interval: 60s
|
||||
rules:
|
||||
- alert: FluxerTelemetryIngestionStopped
|
||||
expr: |
|
||||
increase(signoz_traces_signoz_index_v2[15m]) == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
alert_type: telemetry
|
||||
annotations:
|
||||
summary: 'No traces being ingested'
|
||||
description: "SigNoz collector hasn't received traces in 15 minutes. Check collector health and service instrumentation."
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/telemetry-down'
|
||||
|
||||
- name: fluxer_media_proxy_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: FluxerMediaProxyErrorRate
|
||||
expr: |
|
||||
sum(rate(media_proxy_failure{service_name="fluxer-media-proxy"}[5m]))
|
||||
/
|
||||
sum(rate(http_server_request_count{service_name="fluxer-media-proxy"}[5m])) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxer-media-proxy
|
||||
alert_type: error_rate
|
||||
annotations:
|
||||
summary: 'Media proxy error rate above 10%'
|
||||
description: 'Media proxy failing {{ $value | humanizePercentage }} of requests. Check origin servers and cache.'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/media-proxy-errors'
|
||||
|
||||
- alert: FluxerMediaProxyTimeoutRate
|
||||
expr: |
|
||||
sum(rate(media_proxy_failure{error_type="timeout"}[5m])) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxer-media-proxy
|
||||
alert_type: timeout
|
||||
annotations:
|
||||
summary: 'Media proxy timeout rate above 5 req/s'
|
||||
description: 'Media proxy experiencing high timeout rate. May indicate network issues or slow origins.'
|
||||
runbook: 'https://docs.fluxer.dev/runbooks/media-proxy-timeouts'
|
||||
213
fluxer_devops/signoz/compose.yaml
Normal file
213
fluxer_devops/signoz/compose.yaml
Normal file
@@ -0,0 +1,213 @@
|
||||
x-common: &common
|
||||
networks:
|
||||
- fluxer-shared
|
||||
logging:
|
||||
options:
|
||||
max-size: 50m
|
||||
max-file: '3'
|
||||
|
||||
x-deploy-base: &deploy_base
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 5s
|
||||
max_attempts: 3
|
||||
|
||||
x-clickhouse-defaults: &clickhouse_defaults
|
||||
<<: *common
|
||||
image: clickhouse/clickhouse-server:25.5.6
|
||||
tty: true
|
||||
environment:
|
||||
- CLICKHOUSE_SKIP_USER_SETUP=1
|
||||
deploy:
|
||||
<<: *deploy_base
|
||||
labels:
|
||||
signoz.io/scrape: 'true'
|
||||
signoz.io/port: '9363'
|
||||
signoz.io/path: '/metrics'
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- wget
|
||||
- --spider
|
||||
- -q
|
||||
- 0.0.0.0:8123/ping
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
ulimits:
|
||||
nproc: 65535
|
||||
nofile:
|
||||
soft: 262144
|
||||
hard: 262144
|
||||
|
||||
x-zookeeper-defaults: &zookeeper_defaults
|
||||
<<: *common
|
||||
image: signoz/zookeeper:3.7.1
|
||||
user: root
|
||||
deploy:
|
||||
<<: *deploy_base
|
||||
labels:
|
||||
signoz.io/scrape: 'true'
|
||||
signoz.io/port: '9141'
|
||||
signoz.io/path: '/metrics'
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD-SHELL
|
||||
- curl -s -m 2 http://localhost:8080/commands/ruok | grep error | grep null
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
services:
|
||||
init-clickhouse:
|
||||
<<: *common
|
||||
image: clickhouse/clickhouse-server:25.5.6
|
||||
command:
|
||||
- bash
|
||||
- -c
|
||||
- |
|
||||
version="v0.0.1"
|
||||
node_os=$$(uname -s | tr '[:upper:]' '[:lower:]')
|
||||
node_arch=$$(uname -m | sed s/aarch64/arm64/ | sed s/x86_64/amd64/)
|
||||
echo "Fetching histogram-binary for $${node_os}/$${node_arch}"
|
||||
cd /tmp
|
||||
wget -O histogram-quantile.tar.gz "https://github.com/SigNoz/signoz/releases/download/histogram-quantile%2F$${version}/histogram-quantile_$${node_os}_$${node_arch}.tar.gz"
|
||||
tar -xvzf histogram-quantile.tar.gz
|
||||
mv histogram-quantile /var/lib/clickhouse/user_scripts/histogramQuantile
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
volumes:
|
||||
- ./conf/clickhouse/user_scripts:/var/lib/clickhouse/user_scripts:rw
|
||||
|
||||
zookeeper-1:
|
||||
<<: *zookeeper_defaults
|
||||
environment:
|
||||
- ZOO_SERVER_ID=1
|
||||
- ALLOW_ANONYMOUS_LOGIN=yes
|
||||
- ZOO_AUTOPURGE_INTERVAL=1
|
||||
- ZOO_ENABLE_PROMETHEUS_METRICS=yes
|
||||
- ZOO_PROMETHEUS_METRICS_PORT_NUMBER=9141
|
||||
volumes:
|
||||
- zookeeper-1:/bitnami/zookeeper
|
||||
|
||||
clickhouse:
|
||||
<<: *clickhouse_defaults
|
||||
hostname: clickhouse
|
||||
configs:
|
||||
- source: clickhouse-config
|
||||
target: /etc/clickhouse-server/config.xml
|
||||
- source: clickhouse-users
|
||||
target: /etc/clickhouse-server/users.xml
|
||||
- source: clickhouse-custom-function
|
||||
target: /etc/clickhouse-server/custom-function.xml
|
||||
- source: clickhouse-cluster
|
||||
target: /etc/clickhouse-server/config.d/cluster.xml
|
||||
volumes:
|
||||
- clickhouse:/var/lib/clickhouse/
|
||||
|
||||
schema-migrator:
|
||||
<<: *common
|
||||
image: signoz/signoz-schema-migrator:${OTELCOL_TAG:-v0.129.12}
|
||||
entrypoint: sh
|
||||
command:
|
||||
- -c
|
||||
- /signoz-schema-migrator sync --dsn=tcp://clickhouse:9000 --up= && /signoz-schema-migrator async --dsn=tcp://clickhouse:9000 --up=
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 5s
|
||||
|
||||
signoz:
|
||||
<<: *common
|
||||
image: signoz/signoz:${SIGNOZ_IMAGE_TAG:-v0.108.0}
|
||||
command:
|
||||
- --config=/root/config/prometheus.yml
|
||||
environment:
|
||||
- SIGNOZ_ALERTMANAGER_PROVIDER=signoz
|
||||
- SIGNOZ_TELEMETRYSTORE_CLICKHOUSE_DSN=tcp://clickhouse:9000
|
||||
- SIGNOZ_SQLSTORE_SQLITE_PATH=/var/lib/signoz/signoz.db
|
||||
- DASHBOARDS_PATH=/root/config/dashboards
|
||||
- STORAGE=clickhouse
|
||||
- GODEBUG=netdns=go
|
||||
- TELEMETRY_ENABLED=true
|
||||
- DEPLOYMENT_TYPE=docker-swarm
|
||||
- DOT_METRICS_ENABLED=true
|
||||
configs:
|
||||
- source: signoz-prometheus-config
|
||||
target: /root/config/prometheus.yml
|
||||
volumes:
|
||||
- sqlite:/var/lib/signoz/
|
||||
- ./dashboards:/root/config/dashboards:ro
|
||||
deploy:
|
||||
<<: *deploy_base
|
||||
replicas: 1
|
||||
labels:
|
||||
- 'caddy=signoz.fluxer.app'
|
||||
- 'caddy.reverse_proxy={{upstreams 8080}}'
|
||||
- 'caddy.header.Strict-Transport-Security="max-age=31536000; includeSubDomains; preload"'
|
||||
- 'caddy.header.X-Xss-Protection="1; mode=block"'
|
||||
- 'caddy.header.X-Content-Type-Options=nosniff'
|
||||
- 'caddy.header.Referrer-Policy=strict-origin-when-cross-origin'
|
||||
- 'caddy.header.X-Frame-Options=DENY'
|
||||
healthcheck:
|
||||
test:
|
||||
- CMD
|
||||
- wget
|
||||
- --spider
|
||||
- -q
|
||||
- localhost:8080/api/v1/health
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
|
||||
otel-collector:
|
||||
<<: *common
|
||||
image: signoz/signoz-otel-collector:${OTELCOL_TAG:-v0.129.12}
|
||||
command:
|
||||
- --config=/etc/otel-collector-config.yaml
|
||||
- --manager-config=/etc/manager-config.yaml
|
||||
- --copy-path=/var/tmp/collector-config.yaml
|
||||
- --feature-gates=-pkg.translator.prometheus.NormalizeName
|
||||
configs:
|
||||
- source: otel-collector-config
|
||||
target: /etc/otel-collector-config.yaml
|
||||
- source: otel-manager-config
|
||||
target: /etc/manager-config.yaml
|
||||
environment:
|
||||
- OTEL_RESOURCE_ATTRIBUTES=host.name={{.Node.Hostname}},os.type={{.Node.Platform.OS}}
|
||||
- LOW_CARDINAL_EXCEPTION_GROUPING=false
|
||||
ports:
|
||||
- '4317:4317'
|
||||
- '4318:4318'
|
||||
deploy:
|
||||
<<: *deploy_base
|
||||
replicas: 3
|
||||
|
||||
networks:
|
||||
fluxer-shared:
|
||||
external: true
|
||||
|
||||
volumes:
|
||||
clickhouse:
|
||||
driver: local
|
||||
sqlite:
|
||||
driver: local
|
||||
zookeeper-1:
|
||||
driver: local
|
||||
|
||||
configs:
|
||||
clickhouse-config:
|
||||
file: ./conf/clickhouse/config.xml
|
||||
clickhouse-users:
|
||||
file: ./conf/clickhouse/users.xml
|
||||
clickhouse-custom-function:
|
||||
file: ./conf/clickhouse/custom-function.xml
|
||||
clickhouse-cluster:
|
||||
file: ./conf/clickhouse/cluster.xml
|
||||
signoz-prometheus-config:
|
||||
file: ./conf/signoz/prometheus.yml
|
||||
otel-collector-config:
|
||||
file: ./conf/signoz/otel-collector-config.yaml
|
||||
otel-manager-config:
|
||||
file: ./conf/signoz/otel-collector-opamp-config.yaml
|
||||
75
fluxer_devops/signoz/conf/clickhouse/cluster.xml
Normal file
75
fluxer_devops/signoz/conf/clickhouse/cluster.xml
Normal file
@@ -0,0 +1,75 @@
|
||||
<?xml version="1.0"?>
|
||||
<clickhouse>
|
||||
<!-- ZooKeeper is used to store metadata about replicas, when using Replicated tables.
|
||||
Optional. If you don't use replicated tables, you can omit this section.
|
||||
|
||||
See https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/replication/
|
||||
-->
|
||||
<zookeeper>
|
||||
<node index="1">
|
||||
<host>zookeeper-1</host>
|
||||
<port>2181</port>
|
||||
</node>
|
||||
<!-- <node index="2">
|
||||
<host>zookeeper-2</host>
|
||||
<port>2181</port>
|
||||
</node>
|
||||
<node index="3">
|
||||
<host>zookeeper-3</host>
|
||||
<port>2181</port>
|
||||
</node> -->
|
||||
</zookeeper>
|
||||
|
||||
<!-- Configuration of clusters that could be used in Distributed tables.
|
||||
https://clickhouse.com/docs/en/operations/table_engines/distributed/
|
||||
-->
|
||||
<remote_servers>
|
||||
<cluster>
|
||||
<!-- Inter-server per-cluster secret for Distributed queries
|
||||
default: no secret (no authentication will be performed)
|
||||
|
||||
If set, then Distributed queries will be validated on shards, so at least:
|
||||
- such cluster should exist on the shard,
|
||||
- such cluster should have the same secret.
|
||||
|
||||
And also (and which is more important), the initial_user will
|
||||
be used as current user for the query.
|
||||
|
||||
Right now the protocol is pretty simple and it only takes into account:
|
||||
- cluster name
|
||||
- query
|
||||
|
||||
It would also be nice if the following were implemented:
|
||||
- source hostname (see interserver_http_host), but then it will depend on DNS,
|
||||
it can use an IP address instead, but then you need to get the correct one on the initiator node.
|
||||
- target hostname / ip address (same notes as for source hostname)
|
||||
- time-based security tokens
|
||||
-->
|
||||
<!-- <secret></secret> -->
|
||||
<shard>
|
||||
<!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
|
||||
<!-- <internal_replication>false</internal_replication> -->
|
||||
<!-- Optional. Shard weight when writing data. Default: 1. -->
|
||||
<!-- <weight>1</weight> -->
|
||||
<replica>
|
||||
<host>clickhouse</host>
|
||||
<port>9000</port>
|
||||
<!-- Optional. Priority of the replica for load_balancing. Default: 1 (less value has more priority). -->
|
||||
<!-- <priority>1</priority> -->
|
||||
</replica>
|
||||
</shard>
|
||||
<!-- <shard>
|
||||
<replica>
|
||||
<host>clickhouse-2</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
</shard>
|
||||
<shard>
|
||||
<replica>
|
||||
<host>clickhouse-3</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
</shard> -->
|
||||
</cluster>
|
||||
</remote_servers>
|
||||
</clickhouse>
|
||||
8
fluxer_devops/signoz/conf/clickhouse/config.d/keeper.xml
Normal file
8
fluxer_devops/signoz/conf/clickhouse/config.d/keeper.xml
Normal file
@@ -0,0 +1,8 @@
|
||||
<clickhouse>
|
||||
<keeper_server>
|
||||
<tcp_port>9181</tcp_port>
|
||||
<server_id>1</server_id>
|
||||
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
|
||||
<snapshot_storage_path>/var/lib/clickhouse/coordination/snapshots</snapshot_storage_path>
|
||||
</keeper_server>
|
||||
</clickhouse>
|
||||
1142
fluxer_devops/signoz/conf/clickhouse/config.xml
Normal file
1142
fluxer_devops/signoz/conf/clickhouse/config.xml
Normal file
File diff suppressed because it is too large
Load Diff
21
fluxer_devops/signoz/conf/clickhouse/custom-function.xml
Normal file
21
fluxer_devops/signoz/conf/clickhouse/custom-function.xml
Normal file
@@ -0,0 +1,21 @@
|
||||
<functions>
|
||||
<function>
|
||||
<type>executable</type>
|
||||
<name>histogramQuantile</name>
|
||||
<return_type>Float64</return_type>
|
||||
<argument>
|
||||
<type>Array(Float64)</type>
|
||||
<name>buckets</name>
|
||||
</argument>
|
||||
<argument>
|
||||
<type>Array(Float64)</type>
|
||||
<name>counts</name>
|
||||
</argument>
|
||||
<argument>
|
||||
<type>Float64</type>
|
||||
<name>quantile</name>
|
||||
</argument>
|
||||
<format>CSV</format>
|
||||
<command>./histogramQuantile</command>
|
||||
</function>
|
||||
</functions>
|
||||
123
fluxer_devops/signoz/conf/clickhouse/users.xml
Normal file
123
fluxer_devops/signoz/conf/clickhouse/users.xml
Normal file
@@ -0,0 +1,123 @@
|
||||
<?xml version="1.0"?>
|
||||
<clickhouse>
|
||||
<!-- See also the files in users.d directory where the settings can be overridden. -->
|
||||
|
||||
<!-- Profiles of settings. -->
|
||||
<profiles>
|
||||
<!-- Default settings. -->
|
||||
<default>
|
||||
<!-- Maximum memory usage for processing single query, in bytes. -->
|
||||
<max_memory_usage>10000000000</max_memory_usage>
|
||||
|
||||
<!-- How to choose between replicas during distributed query processing.
|
||||
random - choose random replica from set of replicas with minimum number of errors
|
||||
nearest_hostname - from set of replicas with minimum number of errors, choose replica
|
||||
with minimum number of different symbols between replica's hostname and local hostname
|
||||
(Hamming distance).
|
||||
in_order - first live replica is chosen in specified order.
|
||||
first_or_random - if the first replica has a higher number of errors, pick a random one from the replicas with the minimum number of errors.
|
||||
-->
|
||||
<load_balancing>random</load_balancing>
|
||||
</default>
|
||||
|
||||
<!-- Profile that allows only read queries. -->
|
||||
<readonly>
|
||||
<readonly>1</readonly>
|
||||
</readonly>
|
||||
</profiles>
|
||||
|
||||
<!-- Users and ACL. -->
|
||||
<users>
|
||||
<!-- If user name was not specified, 'default' user is used. -->
|
||||
<default>
|
||||
<!-- See also the files in users.d directory where the password can be overridden.
|
||||
|
||||
Password could be specified in plaintext or in SHA256 (in hex format).
|
||||
|
||||
If you want to specify password in plaintext (not recommended), place it in 'password' element.
|
||||
Example: <password>qwerty</password>.
|
||||
Password could be empty.
|
||||
|
||||
If you want to specify SHA256, place it in 'password_sha256_hex' element.
|
||||
Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex>
|
||||
Restrictions of SHA256: it is impossible to connect to ClickHouse using the MySQL JS client (as of July 2019).
|
||||
|
||||
If you want to specify double SHA1, place it in 'password_double_sha1_hex' element.
|
||||
Example: <password_double_sha1_hex>e395796d6546b1b65db9d665cd43f0e858dd4303</password_double_sha1_hex>
|
||||
|
||||
If you want to specify a previously defined LDAP server (see 'ldap_servers' in the main config) for authentication,
|
||||
place its name in 'server' element inside 'ldap' element.
|
||||
Example: <ldap><server>my_ldap_server</server></ldap>
|
||||
|
||||
If you want to authenticate the user via Kerberos (assuming Kerberos is enabled, see 'kerberos' in the main config),
|
||||
place 'kerberos' element instead of 'password' (and similar) elements.
|
||||
The name part of the canonical principal name of the initiator must match the user name for authentication to succeed.
|
||||
You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests
|
||||
whose initiator's realm matches it.
|
||||
Example: <kerberos />
|
||||
Example: <kerberos><realm>EXAMPLE.COM</realm></kerberos>
|
||||
|
||||
How to generate decent password:
|
||||
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
|
||||
The first line will be the password and the second line the corresponding SHA256.
|
||||
|
||||
How to generate double SHA1:
|
||||
Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
|
||||
The first line will be the password and the second line the corresponding double SHA1.
|
||||
-->
|
||||
<password></password>
|
||||
|
||||
<!-- List of networks with open access.
|
||||
|
||||
To open access from everywhere, specify:
|
||||
<ip>::/0</ip>
|
||||
|
||||
To open access only from localhost, specify:
|
||||
<ip>::1</ip>
|
||||
<ip>127.0.0.1</ip>
|
||||
|
||||
Each element of list has one of the following forms:
|
||||
<ip> IP-address or network mask. Examples: 213.180.204.3 or 10.0.0.1/8 or 10.0.0.1/255.255.255.0
|
||||
2a02:6b8::3 or 2a02:6b8::3/64 or 2a02:6b8::3/ffff:ffff:ffff:ffff::.
|
||||
<host> Hostname. Example: server01.clickhouse.com.
|
||||
To check access, DNS query is performed, and all received addresses compared to peer address.
|
||||
<host_regexp> Regular expression for host names. Example, ^server\d\d-\d\d-\d\.clickhouse\.com$
|
||||
To check access, DNS PTR query is performed for peer address and then regexp is applied.
|
||||
Then, for result of PTR query, another DNS query is performed and all received addresses compared to peer address.
|
||||
It is strongly recommended that the regexp ends with $
|
||||
All results of DNS requests are cached till server restart.
|
||||
-->
|
||||
<networks>
|
||||
<ip>::/0</ip>
|
||||
</networks>
|
||||
|
||||
<!-- Settings profile for user. -->
|
||||
<profile>default</profile>
|
||||
|
||||
<!-- Quota for user. -->
|
||||
<quota>default</quota>
|
||||
|
||||
<!-- User can create other users and grant rights to them. -->
|
||||
<!-- <access_management>1</access_management> -->
|
||||
</default>
|
||||
</users>
|
||||
|
||||
<!-- Quotas. -->
|
||||
<quotas>
|
||||
<!-- Name of quota. -->
|
||||
<default>
|
||||
<!-- Limits for time interval. You could specify many intervals with different limits. -->
|
||||
<interval>
|
||||
<!-- Length of interval. -->
|
||||
<duration>3600</duration>
|
||||
|
||||
<!-- No limits. Just calculate resource usage for time interval. -->
|
||||
<queries>0</queries>
|
||||
<errors>0</errors>
|
||||
<result_rows>0</result_rows>
|
||||
<read_rows>0</read_rows>
|
||||
<execution_time>0</execution_time>
|
||||
</interval>
|
||||
</default>
|
||||
</quotas>
|
||||
</clickhouse>
|
||||
109
fluxer_devops/signoz/conf/signoz/otel-collector-config.yaml
Normal file
109
fluxer_devops/signoz/conf/signoz/otel-collector-config.yaml
Normal file
@@ -0,0 +1,109 @@
|
||||
connectors:
|
||||
signozmeter:
|
||||
metrics_flush_interval: 1h
|
||||
dimensions:
|
||||
- name: service.name
|
||||
- name: deployment.environment
|
||||
- name: host.name
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
prometheus:
|
||||
config:
|
||||
global:
|
||||
scrape_interval: 60s
|
||||
scrape_configs:
|
||||
- job_name: otel-collector
|
||||
static_configs:
|
||||
- targets:
|
||||
- localhost:8888
|
||||
labels:
|
||||
job_name: otel-collector
|
||||
processors:
|
||||
batch:
|
||||
send_batch_size: 10000
|
||||
send_batch_max_size: 11000
|
||||
timeout: 10s
|
||||
batch/meter:
|
||||
send_batch_max_size: 25000
|
||||
send_batch_size: 20000
|
||||
timeout: 1s
|
||||
resourcedetection:
|
||||
detectors: [env, system]
|
||||
timeout: 2s
|
||||
signozspanmetrics/delta:
|
||||
metrics_exporter: signozclickhousemetrics
|
||||
metrics_flush_interval: 60s
|
||||
latency_histogram_buckets:
|
||||
[100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s]
|
||||
dimensions_cache_size: 100000
|
||||
aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
|
||||
enable_exp_histogram: true
|
||||
dimensions:
|
||||
- name: service.namespace
|
||||
default: default
|
||||
- name: deployment.environment
|
||||
default: default
|
||||
- name: signoz.collector.id
|
||||
- name: service.version
|
||||
- name: browser.platform
|
||||
- name: browser.mobile
|
||||
- name: k8s.cluster.name
|
||||
- name: k8s.node.name
|
||||
- name: k8s.namespace.name
|
||||
- name: host.name
|
||||
- name: host.type
|
||||
- name: container.name
|
||||
extensions:
|
||||
health_check:
|
||||
endpoint: 0.0.0.0:13133
|
||||
pprof:
|
||||
endpoint: 0.0.0.0:1777
|
||||
exporters:
|
||||
clickhousetraces:
|
||||
datasource: tcp://clickhouse:9000/signoz_traces
|
||||
low_cardinal_exception_grouping: ${env:LOW_CARDINAL_EXCEPTION_GROUPING}
|
||||
use_new_schema: true
|
||||
signozclickhousemetrics:
|
||||
dsn: tcp://clickhouse:9000/signoz_metrics
|
||||
clickhouselogsexporter:
|
||||
dsn: tcp://clickhouse:9000/signoz_logs
|
||||
timeout: 10s
|
||||
use_new_schema: true
|
||||
signozclickhousemeter:
|
||||
dsn: tcp://clickhouse:9000/signoz_meter
|
||||
timeout: 45s
|
||||
sending_queue:
|
||||
enabled: false
|
||||
service:
|
||||
telemetry:
|
||||
logs:
|
||||
encoding: json
|
||||
extensions:
|
||||
- health_check
|
||||
- pprof
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [signozspanmetrics/delta, batch]
|
||||
exporters: [clickhousetraces, signozmeter]
|
||||
metrics:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [signozclickhousemetrics, signozmeter]
|
||||
metrics/prometheus:
|
||||
receivers: [prometheus]
|
||||
processors: [batch]
|
||||
exporters: [signozclickhousemetrics, signozmeter]
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [clickhouselogsexporter, signozmeter]
|
||||
metrics/meter:
|
||||
receivers: [signozmeter]
|
||||
processors: [batch/meter]
|
||||
exporters: [signozclickhousemeter]
|
||||
@@ -0,0 +1 @@
|
||||
server_endpoint: ws://signoz:4320/v1/opamp
|
||||
16
fluxer_devops/signoz/conf/signoz/prometheus.yml
Normal file
16
fluxer_devops/signoz/conf/signoz/prometheus.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
global:
|
||||
scrape_interval: 5s
|
||||
evaluation_interval: 15s
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
|
||||
rule_files: []
|
||||
|
||||
scrape_configs: []
|
||||
|
||||
remote_read:
|
||||
- url: tcp://clickhouse:9000/signoz_metrics
|
||||
0
fluxer_devops/signoz/dashboards/.gitkeep
Normal file
0
fluxer_devops/signoz/dashboards/.gitkeep
Normal file
38
fluxer_devops/signoz/deploy.sh
Executable file
38
fluxer_devops/signoz/deploy.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env sh
|
||||
|
||||
# Copyright (C) 2026 Fluxer Contributors
|
||||
#
|
||||
# This file is part of Fluxer.
|
||||
#
|
||||
# Fluxer is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# Fluxer is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with Fluxer. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
# Deploy the SigNoz observability stack to Docker Swarm.
#
# Environment overrides:
#   STACK             Name of the swarm stack (default: fluxer-signoz).
#   SIGNOZ_IMAGE_TAG  Image tag exported for compose.yaml (default: v0.105.1).
#
# Exits non-zero if the local node is not an active swarm member or if any
# docker command fails (set -e).
set -eu

# Resolve the directory containing this script so compose.yaml is found
# regardless of the caller's working directory.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
STACK=${STACK:-fluxer-signoz}
SIGNOZ_IMAGE_TAG=${SIGNOZ_IMAGE_TAG:-v0.105.1}

# Verify swarm mode BEFORE touching the network: overlay networks can only be
# created on an active swarm manager, so creating the network first would abort
# (via set -e) with Docker's raw error instead of the guidance below.
if [ "$(docker info --format '{{.Swarm.LocalNodeState}}')" != "active" ]; then
	echo "Docker swarm must be active for stack deployment. Run 'docker swarm init' and try again." >&2
	exit 1
fi

# Create the shared overlay network on first deploy; it is left untouched when
# it already exists.
if ! docker network inspect fluxer-shared >/dev/null 2>&1; then
	docker network create -d overlay fluxer-shared
fi

# Export so the values are visible to variable interpolation in compose.yaml.
export STACK
export SIGNOZ_IMAGE_TAG

docker stack deploy --with-registry-auth -c "$SCRIPT_DIR/compose.yaml" "$STACK"
|
||||
Reference in New Issue
Block a user