Self-Hosted Business Monitoring
Deploy a private end-to-end business monitoring system - event collection, storage, dashboards, funnel analysis, and alerting
Self-Hosted Business Monitoring System
Deploy a complete, private business monitoring platform that covers event collection, real-time dashboards, funnel analysis, and threshold-based alerting — all running within your own infrastructure.
Architecture Overview
┌─────────────────────────────────────────────────────────────────┐
│ Browser / App │
│ ┌─────────────────┐ ┌─────────────────┐ │
│ │ Event Tracking │ │ Error Tracking │ │
│ │ SDK │ │ (Sentry OSS) │ │
│ └────────┬─────────┘ └────────┬─────────┘ │
└───────────┼──────────────────────┼──────────────────────────────┘
│ │
▼ ▼
┌─────────────────────────────────────────────────────────────────┐
│ Ingestion │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Collector API (Node.js) │ │
│ │ - validate & enrich events │ │
│ │ - batch write to ClickHouse │ │
│ │ - forward errors to Sentry │ │
│ └─────────┬───────────────────────────────────────────────┘ │
│ │ │
│ ┌─────────▼─────────┐ ┌──────────────┐ ┌────────────────┐ │
│ │ ClickHouse │ │ Redis │ │ Sentry (OSS) │ │
│ │ events + metrics │ │ alert state │ │ error store │ │
│ └─────────┬─────────┘ └──────┬───────┘ └────────────────┘ │
│ │ │ │
│ ┌─────────▼────────────────────▼──────────────────────────┐ │
│ │ Query / Alert Engine │ │
│ └─────────┬───────────────────────────────────────────────┘ │
│ │ │
│ ┌─────────▼───────────────────────────────────────────────┐ │
│ │ Grafana │ │
│ │ - business dashboards │ │
│ │ - funnel & trend panels │ │
│ │ - built-in alerting │ │
│ └─────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────┘Component Responsibilities
| Component | Role |
|---|---|
| Collector API | Validate, enrich, and batch-write events to ClickHouse |
| ClickHouse | Column-oriented storage for events and pre-aggregated metrics |
| Redis | Alert cooldown state, rate limiting, real-time counters |
| Grafana | Dashboards, trend charts, funnel panels, built-in alerting |
| Sentry (OSS) | Error tracking with stack traces, issue grouping, release tracking |
1. ClickHouse Schema
Event Table
CREATE DATABASE IF NOT EXISTS analytics;
-- Raw event stream: one row per tracked event, batch-written by the Collector API.
-- Daily partitions keep retention cheap (whole partitions can be expired);
-- the sort key matches the dominant dashboard access pattern
-- (filter by event and/or page over a time range).
CREATE TABLE analytics.events (
-- Event data
event LowCardinality(String), -- event name, e.g. 'page_view', 'purchase'
properties String, -- JSON string; extract fields at query time via JSONExtract*
-- User context
user_id String,
session_id String,
-- Page context
page String,
referrer String,
-- Environment
device LowCardinality(String), -- mobile, tablet, desktop
browser LowCardinality(String), -- parsed from User-Agent by the collector
os LowCardinality(String), -- parsed from User-Agent by the collector
country LowCardinality(String), -- GeoIP-derived; collector writes 'unknown' when unresolved
app_version LowCardinality(String),
-- Timestamp
timestamp DateTime64(3) -- millisecond precision, set by the collector
) ENGINE = MergeTree()
PARTITION BY toYYYYMMDD(timestamp)
ORDER BY (event, page, timestamp);Pre-Aggregated Views
-- Hourly event counts (fast dashboard queries).
-- NOTE: storing plain uniqExact()/count() values in a SummingMergeTree MV is
-- incorrect — background merges would SUM the partial unique counts from each
-- inserted block, inflating unique_users/unique_sessions. Store aggregate
-- *states* in an AggregatingMergeTree and finalize at query time instead:
--   SELECT hour, countMerge(event_count), uniqExactMerge(unique_users)
--   FROM analytics.events_hourly_mv GROUP BY hour
CREATE MATERIALIZED VIEW analytics.events_hourly_mv
ENGINE = AggregatingMergeTree()
PARTITION BY toYYYYMMDD(hour)
ORDER BY (event, page, device, country, hour)
AS
SELECT
    event,
    page,
    device,
    country,
    toStartOfHour(timestamp) AS hour,
    countState() AS event_count,
    uniqExactState(user_id) AS unique_users,
    uniqExactState(session_id) AS unique_sessions
FROM analytics.events
GROUP BY event, page, device, country, hour;
-- Daily funnel steps (for conversion analysis).
-- Stores, per user per day, an unfinalized aggregate state of the funnel
-- events seen. AggregatingMergeTree + groupArrayState let ClickHouse merge
-- partial states correctly across inserts; read the array back with
-- groupArrayMerge(events_state) GROUP BY user_id, day.
CREATE MATERIALIZED VIEW analytics.funnel_daily_mv
ENGINE = AggregatingMergeTree()
PARTITION BY toYYYYMMDD(day)
ORDER BY (user_id, day)
AS
SELECT
user_id,
toDate(timestamp) AS day,
groupArrayState(event) AS events_state
FROM analytics.events
-- Only funnel-relevant events are materialized to keep the view small.
WHERE event IN ('page_view', 'product_click', 'add_to_cart', 'checkout_start', 'purchase')
GROUP BY user_id, day;Useful Query Patterns
-- Top events in the last 24 hours
SELECT event, count() AS total, uniqExact(user_id) AS users
FROM analytics.events
WHERE timestamp >= now() - INTERVAL 24 HOUR
GROUP BY event
ORDER BY total DESC
LIMIT 20;
-- Conversion funnel (window funnel within 30 minutes)
-- windowFunnel(1800) returns, per user, the deepest step reached where the
-- steps occurred in order within an 1800-second (30-minute) sliding window.
SELECT
countIf(level >= 1) AS step1_page_view,
countIf(level >= 2) AS step2_add_to_cart,
countIf(level >= 3) AS step3_purchase
FROM (
SELECT
user_id,
windowFunnel(1800)(timestamp, event='page_view', event='add_to_cart', event='purchase') AS level
FROM analytics.events
WHERE timestamp >= now() - INTERVAL 7 DAY
GROUP BY user_id
);
-- Hourly trend for a specific event
SELECT
toStartOfHour(timestamp) AS hour,
count() AS total,
uniqExact(user_id) AS users
FROM analytics.events
WHERE event = 'purchase'
AND timestamp >= now() - INTERVAL 7 DAY
GROUP BY hour
ORDER BY hour;
-- Extract JSON property for analysis
-- `properties` is stored as a raw JSON string; JSONExtractString pulls a field.
SELECT
JSONExtractString(properties, 'category') AS category,
count() AS total,
uniqExact(user_id) AS users
FROM analytics.events
WHERE event = 'product_click'
AND timestamp >= now() - INTERVAL 30 DAY
GROUP BY category
ORDER BY total DESC;2. Collector API
// server/index.ts
// Event collector: validates and enriches incoming events, buffers them in
// memory, and batch-writes to ClickHouse on an interval or size threshold.
import express from 'express';
import { createClient } from '@clickhouse/client';
import geoip from 'geoip-lite';

const app = express();
const clickhouse = createClient({
  url: process.env.CLICKHOUSE_URL ?? 'http://localhost:8123',
  database: 'analytics',
});

// ── Buffer and batch write ──────────────────────────────────
let buffer: any[] = [];
const FLUSH_INTERVAL = 5000; // ms between background flushes
const MAX_BATCH = 1000;      // flush early once this many events are buffered
const MAX_BUFFER = 50_000;   // hard cap so a ClickHouse outage cannot grow memory unbounded

// Write all buffered events to ClickHouse in a single insert.
// On failure the batch is re-queued for the next attempt, but the buffer is
// capped at MAX_BUFFER (oldest events shed first — analytics is loss-tolerant).
async function flush() {
  if (buffer.length === 0) return;
  // splice(0) takes ownership synchronously, so overlapping flush calls
  // never double-write the same events.
  const batch = buffer.splice(0);
  try {
    await clickhouse.insert({
      table: 'events',
      values: batch,
      format: 'JSONEachRow',
    });
    console.log(`Flushed ${batch.length} events`);
  } catch (err) {
    console.error('Flush failed, re-queuing:', err);
    buffer.unshift(...batch); // put back for retry
    if (buffer.length > MAX_BUFFER) {
      buffer.splice(0, buffer.length - MAX_BUFFER); // shed oldest beyond the cap
    }
  }
}
setInterval(flush, FLUSH_INTERVAL);

// ── Collector endpoint ──────────────────────────────────────
// Accepts a single event object or an array of events; always responds 204.
app.post('/api/events/collect', express.json(), (req, res) => {
  const events = Array.isArray(req.body) ? req.body : [req.body];
  // X-Forwarded-For may contain a comma-separated chain ("client, proxy1, …");
  // geoip.lookup needs a single address, so take the first (client) entry.
  const forwarded = (req.headers['x-forwarded-for'] as string | undefined)
    ?.split(',')[0]
    .trim();
  const ip = forwarded || req.ip;
  const geo = ip ? geoip.lookup(ip) : null;
  const ua = (req.headers['user-agent'] as string | undefined) ?? '';
  for (const e of events) {
    // Minimal validation: skip null/non-object items and anything without a
    // non-empty string event name (guards against `e.event` throwing on null).
    if (!e || typeof e !== 'object' || typeof e.event !== 'string' || e.event === '') continue;
    buffer.push({
      event: e.event,
      properties: JSON.stringify(e.properties ?? {}),
      user_id: e.context?.userId ?? '',
      session_id: e.context?.sessionId ?? '',
      page: e.context?.page ?? '',
      referrer: e.context?.referrer ?? '',
      device: e.context?.device ?? '',
      browser: parseBrowser(ua),
      os: parseOS(ua),
      country: geo?.country ?? 'unknown',
      app_version: e.context?.appVersion ?? '',
      // NOTE(review): ISO-8601 with 'T'/'Z' requires date_time_input_format=
      // best_effort on the ClickHouse side for DateTime64 parsing — confirm.
      timestamp: new Date(e.context?.timestamp ?? Date.now()).toISOString(),
    });
  }
  if (buffer.length >= MAX_BATCH) flush().catch(console.error);
  res.status(204).end();
});

// ── Health check ────────────────────────────────────────────
// Reports ok only when ClickHouse answers a trivial query; exposes buffer depth.
app.get('/health', async (_, res) => {
  try {
    await clickhouse.query({ query: 'SELECT 1', format: 'JSON' });
    res.json({ status: 'ok', buffer: buffer.length });
  } catch {
    res.status(503).json({ status: 'error' });
  }
});

app.listen(Number(process.env.PORT ?? 3001), () => {
  console.log(`Collector listening on :${process.env.PORT ?? 3001}`);
});
// ── Helpers ─────────────────────────────────────────────────
// Coarse browser-family classification from the User-Agent header.
// Order matters: Chromium-based Edge UAs also contain "Chrome" and identify
// themselves with the "Edg/" token (legacy Edge used "Edge/"). The previous
// check `/Edge|OPR/` never matched "Edg/", so modern Edge was misclassified
// as Chrome — test Edge (and Opera) before Chrome.
function parseBrowser(ua: string): string {
  if (/Edge?\//i.test(ua)) return 'Edge';   // matches both "Edg/…" and "Edge/…"
  if (/OPR\//i.test(ua)) return 'Other';    // Opera (Chromium) — not tracked separately, as before
  if (/Firefox/i.test(ua)) return 'Firefox';
  if (/Chrome/i.test(ua)) return 'Chrome';
  if (/Safari/i.test(ua)) return 'Safari';  // real Safari UAs lack "Chrome"
  return 'Other';
}
// Coarse OS classification from the User-Agent header.
// Order matters: iPhone/iPad UAs contain "like Mac OS X" and Android UAs
// contain "Linux". The previous version tested /Mac OS/ before /iPhone|iPad/,
// so every iOS device was misclassified as macOS — test iOS/Android first.
// NOTE(review): iPadOS 13+ may present a desktop "Macintosh" UA that is
// indistinguishable from macOS here.
function parseOS(ua: string): string {
  if (/iPhone|iPad|iPod/i.test(ua)) return 'iOS';
  if (/Android/i.test(ua)) return 'Android';
  if (/Windows/i.test(ua)) return 'Windows';
  if (/Mac OS/i.test(ua)) return 'macOS';
  if (/Linux/i.test(ua)) return 'Linux';
  return 'Other';
}
3. Grafana Dashboards
Grafana connects directly to ClickHouse and provides trend charts, tables, and alerting — all without custom UI code.
Data Source Configuration
# grafana/provisioning/datasources/clickhouse.yaml
# ClickHouse datasource provisioned as code. The explicit `uid` is required so
# alert rules can reference this datasource via `datasourceUid: clickhouse`.
apiVersion: 1
datasources:
  - name: ClickHouse
    uid: clickhouse
    type: grafana-clickhouse-datasource
    url: http://clickhouse:8123
    jsonData:
      defaultDatabase: analytics
      username: default
    secureJsonData:
      password: ${CLICKHOUSE_PASSWORD}
Dashboard Panels
All panels below are configured in Grafana UI. The SQL queries are entered directly into each panel.
Event Volume Trend
-- Panel type: Time series
-- Grafana variables: $event, $timeFilter (auto)
SELECT
$__timeInterval(timestamp) AS time,
count() AS total,
uniqExact(user_id) AS unique_users
FROM analytics.events
WHERE $__timeFilter(timestamp)
AND event = '$event'
GROUP BY time
ORDER BY time
This chart automatically responds to Grafana's time range picker (top right). Users can select the last 1 hour, 6 hours, 24 hours, 7 days, 30 days, or any custom range.
Top Events Table
-- Panel type: Table
SELECT
event,
count() AS total,
uniqExact(user_id) AS unique_users,
uniqExact(session_id) AS sessions
FROM analytics.events
WHERE $__timeFilter(timestamp)
GROUP BY event
ORDER BY total DESC
LIMIT 20
Conversion Funnel
-- Panel type: Bar gauge
SELECT
countIf(level >= 1) AS "1. Page View",
countIf(level >= 2) AS "2. Add to Cart",
countIf(level >= 3) AS "3. Checkout",
countIf(level >= 4) AS "4. Purchase"
FROM (
SELECT
user_id,
windowFunnel(1800)(
timestamp,
event = 'page_view',
event = 'add_to_cart',
event = 'checkout_start',
event = 'purchase'
) AS level
FROM analytics.events
WHERE $__timeFilter(timestamp)
GROUP BY user_id
)
Device and Country Breakdown
-- Panel type: Pie chart (device)
SELECT device, count() AS total
FROM analytics.events
WHERE $__timeFilter(timestamp) AND event = '$event'
GROUP BY device
-- Panel type: Geo map (country)
SELECT country, count() AS total
FROM analytics.events
WHERE $__timeFilter(timestamp) AND event = '$event'
GROUP BY country
Error Rate Over Time
-- Panel type: Time series
SELECT
$__timeInterval(timestamp) AS time,
countIf(event = 'error') AS errors,
count() AS total,
countIf(event = 'error') / count() * 100 AS error_rate
FROM analytics.events
WHERE $__timeFilter(timestamp)
GROUP BY time
ORDER BY time
Dashboard Variables
Configure Grafana template variables so users can filter by event, page, or device:
| Variable | Type | Query |
|---|---|---|
$event | Query | SELECT DISTINCT event FROM analytics.events WHERE timestamp >= now() - INTERVAL 7 DAY ORDER BY event |
$page | Query | SELECT DISTINCT page FROM analytics.events WHERE timestamp >= now() - INTERVAL 7 DAY ORDER BY page |
$device | Custom | mobile, tablet, desktop |
4. Grafana Alerting
Grafana's built-in alerting evaluates queries on a schedule and notifies via Slack, email, webhook, etc.
Contact Points
# grafana/provisioning/alerting/contacts.yaml
# Notification channels ("contact points") provisioned as code:
# a Slack webhook and a WeCom (WeChat Work) webhook.
apiVersion: 1
contactPoints:
  - orgId: 1
    name: slack-perf
    receivers:
      - uid: slack-1
        type: slack
        settings:
          url: ${SLACK_WEBHOOK_URL}
          title: |
            {{ template "default.title" . }}
          text: |
            {{ template "default.message" . }}
  - orgId: 1
    name: wecom
    receivers:
      - uid: wecom-1
        type: webhook
        settings:
          url: https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=${WECOM_KEY}
          httpMethod: POST
          # WeCom expects its own JSON envelope; build it from alert labels/annotations.
          body: |
            {
              "msgtype": "markdown",
              "markdown": {
                "content": "**{{ .CommonLabels.alertname }}**\n> {{ .CommonAnnotations.summary }}"
              }
            }
Alert Rules
# grafana/provisioning/alerting/rules.yaml
# Alert rules as code. Each rule runs a ClickHouse query (refId A) and feeds
# the result into a server-side threshold expression (refId C).
# NOTE(review): `datasourceUid: clickhouse` must match the `uid` assigned to
# the ClickHouse datasource in its provisioning file — confirm they agree.
apiVersion: 1
groups:
  - orgId: 1
    name: business-alerts
    folder: Business Monitoring
    interval: 1m                    # evaluation frequency
    rules:
      # Purchase volume drop
      - uid: purchase-drop
        title: Purchase volume critically low
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 3600            # seconds of lookback (1 hour)
              to: 0
            datasourceUid: clickhouse
            model:
              rawSql: |
                SELECT count() AS value
                FROM analytics.events
                WHERE event = 'purchase'
                AND timestamp >= now() - INTERVAL 1 HOUR
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params: [10]
        for: 5m                     # must stay low for 5 minutes before firing
        labels:
          severity: critical
        annotations:
          summary: "Purchase count dropped below 10/hour (current: {{ $values.A }})"
      # Error spike
      - uid: error-spike
        title: Error event spike
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 300             # 5-minute lookback
              to: 0
            datasourceUid: clickhouse
            model:
              rawSql: |
                SELECT count() AS value
                FROM analytics.events
                WHERE event = 'error'
                AND timestamp >= now() - INTERVAL 5 MINUTE
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: gt
                    params: [500]
        for: 0s                     # fire immediately on a spike
        labels:
          severity: critical
        annotations:
          summary: "Error events exceeded 500 in 5 minutes (current: {{ $values.A }})"
      # Conversion rate drop
      - uid: conversion-drop
        title: Cart-to-purchase conversion low
        condition: C
        data:
          - refId: A
            relativeTimeRange:
              from: 3600
              to: 0
            datasourceUid: clickhouse
            model:
              rawSql: |
                SELECT
                countIf(event = 'purchase') / nullIf(countIf(event = 'add_to_cart'), 0) * 100 AS value
                FROM analytics.events
                WHERE timestamp >= now() - INTERVAL 1 HOUR
          - refId: C
            datasourceUid: __expr__
            model:
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params: [2]
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Cart-to-purchase conversion dropped below 2% (current: {{ $values.A }}%)"
5. Sentry (Self-Hosted Error Tracking)
Add self-hosted Sentry for full error tracking with stack traces, issue grouping, and release tracking.
Sentry Client Setup
// lib/sentry.ts
// Client-side Sentry initialization for the self-hosted instance.
import * as Sentry from '@sentry/react';
Sentry.init({
dsn: process.env.NEXT_PUBLIC_SENTRY_DSN, // Points to your self-hosted instance
environment: process.env.NODE_ENV,
release: process.env.NEXT_PUBLIC_APP_VERSION, // ties issues to a deployed version
integrations: [
Sentry.browserTracingIntegration(), // performance tracing
Sentry.replayIntegration({ maskAllText: true }), // session replay; text masked for privacy
],
tracesSampleRate: 0.2, // sample 20% of transactions
replaysSessionSampleRate: 0.1, // record 10% of normal sessions
replaysOnErrorSampleRate: 1.0, // always record sessions that hit an error
});Connecting Events to Errors
When an error fires, send a matching event to ClickHouse so you can correlate business events with errors.
// lib/sentry-integration.ts
// Mirror every captured exception into the analytics pipeline so business
// events can be correlated with Sentry issues via sentry_event_id.
import * as Sentry from '@sentry/react';
import { tracker } from './tracker';
Sentry.addEventProcessor((event) => {
// Only exception events are mirrored; other event types pass through untouched.
if (event.exception) {
tracker.track('error', {
type: 'js_exception',
message: event.exception.values?.[0]?.value ?? 'unknown',
sentry_event_id: event.event_id ?? '', // join key back to the Sentry issue
});
}
return event; // always forward the event unchanged to Sentry
});6. Docker Compose Deployment
# docker-compose.yml
# Full monitoring stack: ClickHouse (events), Redis (alert state), the
# collector API, and Grafana. Healthchecks + `condition: service_healthy`
# ensure the collector and Grafana wait until ClickHouse actually accepts
# queries, not merely until its container has started.
services:
  clickhouse:
    image: clickhouse/clickhouse-server:24
    ports:
      - "8123:8123"   # NOTE(review): bind as "127.0.0.1:8123:8123" on internet-facing hosts
    volumes:
      - ch-data:/var/lib/clickhouse
      - ./clickhouse/init.sql:/docker-entrypoint-initdb.d/init.sql
    environment:
      CLICKHOUSE_DB: analytics
      CLICKHOUSE_USER: default
      CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD}
    healthcheck:
      test: ["CMD", "clickhouse-client", "--query", "SELECT 1"]
      interval: 5s
      timeout: 3s
      retries: 12
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis-data:/data
    command: redis-server --appendonly yes   # persist alert state across restarts
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 3s
      retries: 12
  collector:
    build:
      context: ./collector
      dockerfile: Dockerfile
    ports:
      - "3001:3001"
    environment:
      CLICKHOUSE_URL: http://clickhouse:8123
      REDIS_URL: redis://redis:6379
      PORT: 3001
    depends_on:
      clickhouse:
        condition: service_healthy
      redis:
        condition: service_healthy
  grafana:
    image: grafana/grafana:11
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_PASSWORD}
      GF_INSTALL_PLUGINS: grafana-clickhouse-datasource
      CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD}
      SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
    depends_on:
      clickhouse:
        condition: service_healthy
volumes:
  ch-data:
  redis-data:
  grafana-data:
Adding Self-Hosted Sentry
Sentry provides an official self-hosted distribution. It runs as a separate Docker Compose stack.
# Clone the Sentry self-hosted repo
git clone https://github.com/getsentry/self-hosted.git sentry
cd sentry
# Run the installer (sets up Kafka, PostgreSQL, Redis, Snuba, etc.)
./install.sh
# Start Sentry
docker compose up -d
# Sentry will be available at http://localhost:9000
# Create your project and grab the DSN
Start the Platform
# 1. Create environment file
cat > .env << 'EOL'
CLICKHOUSE_PASSWORD=your-ch-password
GRAFANA_PASSWORD=your-grafana-password
SLACK_WEBHOOK_URL=https://hooks.slack.com/services/xxx/yyy/zzz
EOL
# 2. Initialize ClickHouse schema
cat > clickhouse/init.sql << 'EOSQL'
CREATE DATABASE IF NOT EXISTS analytics;
CREATE TABLE analytics.events (
event LowCardinality(String),
properties String,
user_id String,
session_id String,
page String,
referrer String,
device LowCardinality(String),
browser LowCardinality(String),
os LowCardinality(String),
country LowCardinality(String),
app_version LowCardinality(String),
timestamp DateTime64(3)
) ENGINE = MergeTree()
PARTITION BY toYYYYMMDD(timestamp)
ORDER BY (event, page, timestamp);
EOSQL
# 3. Start all services
docker compose up -d
# 4. Verify
docker compose ps
# clickhouse running 0.0.0.0:8123->8123/tcp
# redis running 0.0.0.0:6379->6379/tcp
# collector running 0.0.0.0:3001->3001/tcp
# grafana running 0.0.0.0:3000->3000/tcp
Nginx Reverse Proxy
# /etc/nginx/conf.d/monitoring.conf
# TLS front door for the monitoring stack: a public collector endpoint for
# the browser SDK, and an internal-only Grafana UI.
upstream collector {
server 127.0.0.1:3001;
}
upstream grafana {
server 127.0.0.1:3000;
}
server {
listen 443 ssl;
server_name monitoring.internal.example.com;
ssl_certificate /etc/ssl/certs/monitoring.crt;
ssl_certificate_key /etc/ssl/private/monitoring.key;
# Collector — accessed by browser SDK (public)
location /api/events/collect {
proxy_pass http://collector;
proxy_set_header Host $host;
# Append the client IP so the collector's GeoIP enrichment sees the real address
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
# "events" zone is declared at file scope below this server block
limit_req zone=events burst=200 nodelay;
# NOTE(review): if the SDK posts cross-origin, CORS headers are needed here
# or in the collector — confirm deployment topology.
}
# Grafana — internal only
location / {
proxy_pass http://grafana;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
# Restrict to internal network
allow 10.0.0.0/8;
allow 172.16.0.0/12;
deny all;
}
}
limit_req_zone $binary_remote_addr zone=events:10m rate=100r/s;
7. Data Retention
-- Data retention.
-- NOTE: ClickHouse has no `ALTER TABLE … DROP PARTITION WHERE …` syntax, and a
-- materialized view cannot be ALTERed directly (its rows live in a hidden
-- `.inner.*` target table). Use table TTLs instead: the server expires old
-- data automatically during background merges — no cron job required.
-- Raw events: keep 90 days.
ALTER TABLE analytics.events
    MODIFY TTL toDateTime(timestamp) + INTERVAL 90 DAY;
-- Hourly aggregates: keep 180 days. This requires creating the MV with an
-- explicit target table (CREATE MATERIALIZED VIEW … TO analytics.events_hourly)
-- and setting the TTL on that table:
-- ALTER TABLE analytics.events_hourly
--     MODIFY TTL toDateTime(hour) + INTERVAL 180 DAY;
Best Practices
Self-Hosted Business Monitoring Guidelines
- Use Grafana over custom UI — it provides time-range pickers, alerting, and dashboards for free; no frontend code to maintain
- Pre-aggregate with materialized views — hourly summaries make dashboard queries instant even with billions of raw events
- Separate raw events from aggregates — keep raw events for 90 days, keep hourly aggregates for 180+ days
- Add GeoIP enrichment at ingestion — enrich with country at the collector, not at query time
- Rate-limit the collector endpoint — protect against SDK bugs or replay attacks flooding storage
- Provision dashboards and alerts as code — store Grafana provisioning YAML in your repo for reproducibility
- Pair with self-hosted Sentry — business events show what happened; Sentry shows why (stack traces, replays)
- Test alert rules in staging first — misconfigured thresholds cause alert fatigue or missed incidents