×
Sample configuration for AWS EC2 handling 100k metrics/sec (large) with NGINX Ingress controller
Download this sample configuration, provided by ITRS, for AWS EC2 handling 100k metrics/sec (large).
# Example Obcerv configuration for AWS EC2 handling 100k metrics/sec.
#
# Nodes:
# - (3) m5.8xlarge (32 CPU, 128GiB Memory) for Timescale
# - (5) c5.4xlarge (16 CPU, 32GiB Memory) for all other workloads
#
# The resource requests for Timescale total 48 cores and 360GiB memory.
# The resource requests for the other workloads total ~63 cores and ~113GiB memory.
# These totals include Linkerd resources.
#
# Disk requirements:
# - Timescale:
#   - 16 TiB data disk for each replica (x3)
#   - 75 GiB WAL disk for each replica (x3)
# - Kafka: 400 GiB for each replica (x3)
# - Loki: 30 GiB for each replica (x1)
# - Zookeeper: 1 GiB for each replica (x3)
# - etcd: 1 GiB for each replica (x3)
# - Downsampled Metrics:
#   - Raw: 5 GiB for each replica (x6)
#   - Bucketed: 5 GiB for each replica (x6)
#
# The configuration references a StorageClass named `io1-25` which uses io1 with 25 iopsPerGB - you can create
# this class or change the config to use a class of your own, but it should be similar in performance.
#
# This configuration is based upon a certain number of Obcerv entities, average metrics per entity, and
# average metrics collection interval. The following formula can be used to estimate the load to expect:
#
#   metrics/sec = (Obcerv entities * metrics/entity) / average metrics collection interval
#
# In this example configuration, we have the following:
#
#   100,000 metrics/sec = (250,000 Obcerv entities * 4 metrics/entity) / 10 seconds average metrics collection interval
#
# StorageClass name; presumably the fallback for volumes that do not set
# their own class (sections that need faster disks name "io1-25") - confirm
# against the chart.
defaultStorageClass: "gp2"
# External hostname and NGINX Ingress annotations for `apps`.
apps:
  externalHostname: "obcerv.mydomain.internal"
  ingress:
    annotations:
      kubernetes.io/ingress.class: "nginx"
      # Declared as the "master" mergeable Ingress; the `iam` section
      # declares the matching "minion".
      nginx.org/mergeable-ingress-type: "master"
# Ingestion endpoint: own external hostname, 3 replicas, NGINX Ingress.
ingestion:
  externalHostname: "obcerv-ingestion.mydomain.internal"
  replicas: 3
  ingress:
    annotations:
      kubernetes.io/ingress.class: "nginx"
      # The backend is addressed over gRPC rather than plain HTTP.
      nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
  # Kubernetes resource requests and limits.
  resources:
    requests:
      memory: "512Mi"
      cpu: "500m"
    limits:
      memory: "768Mi"
      cpu: "1"
# NGINX Ingress annotations for `iam`.
iam:
  ingress:
    annotations:
      kubernetes.io/ingress.class: "nginx"
      # Declared as a "minion" mergeable Ingress; the `apps` section
      # declares the "master".
      nginx.org/mergeable-ingress-type: "minion"
# Zookeeper: 3 replicas; CPU request and limit are both 200m.
zookeeper:
  replicas: 3
  resources:
    requests:
      memory: "256Mi"
      cpu: "200m"
    limits:
      memory: "512Mi"
      cpu: "200m"
# Kafka: 3 replicas, each with a 400Gi disk on the "io1-25" StorageClass.
kafka:
  replicas: 3
  diskSize: "400Gi"
  storageClass: "io1-25"
  defaultPartitions: 12
  # Consumer fetch tuning.
  consumer:
    fetchMaxWaitMs: 2000
    fetchMinBytes: 8388608  # 8 MiB
  # Memory request equals the limit (12Gi); CPU may burst from 3 to 4.
  resources:
    requests:
      memory: "12Gi"
      cpu: "3"
    limits:
      memory: "12Gi"
      cpu: "4"
# Timescale: 3-node cluster on dedicated nodes, with separate data,
# timeseries and WAL disks, all on the "io1-25" StorageClass.
timescale:
  clusterSize: 3
  dataDiskSize: "100Gi"
  dataStorageClass: "io1-25"
  # 4 disks x 4Ti = 16Ti of timeseries storage.
  timeseriesDiskCount: 4
  timeseriesDiskSize: "4Ti"  # Max disk size for AWS io1 is 16Ti
  timeseriesStorageClass: "io1-25"
  walDiskSize: "75Gi"
  walStorageClass: "io1-25"
  # Requests equal limits. With clusterSize 3 this is the 48 cores and
  # 360GiB total quoted in the file header.
  resources:
    requests:
      memory: "120Gi"
      cpu: "16"
    limits:
      memory: "120Gi"
      cpu: "16"
  compressAfter: 1h
  # Schedule only onto nodes labelled instancegroup=timescale-nodes ...
  nodeSelector:
    instancegroup: timescale-nodes
  # ... and tolerate the dedicated=timescale-nodes:NoSchedule taint.
  tolerations:
    - key: dedicated
      operator: Equal
      value: timescale-nodes
      effect: NoSchedule
  # Chunk size and retention period per table. `entity_attributes` sets a
  # chunk size only.
  retention:
    entity_attributes:
      chunkSize: 2d
    metrics:
      chunkSize: 20m
      retention: 20d
    metrics_5m:
      chunkSize: 1h
      retention: 30d
    metrics_15m:
      chunkSize: 2h
      retention: 60d
    metrics_1h:
      chunkSize: 6h
      retention: 90d
    metrics_3h:
      chunkSize: 12h
      retention: 120d
    metrics_12h:
      chunkSize: 2d
      retention: 180d
    metrics_1d:
      chunkSize: 3d
      retention: 1y
    statuses:
      chunkSize: 7d
      retention: 1y
# Loki: 30Gi disk on the "io1-25" StorageClass plus ingestion limits.
loki:
  diskSize: "30Gi"
  storageClass: "io1-25"
  ingestionBurstSize: 12
  ingestionRateLimit: 8
  maxPayloadSize: 8388608  # 8 MiB if expressed in bytes - confirm unit
  resources:
    requests:
      memory: "1Gi"
      cpu: "500m"
    limits:
      memory: "1Gi"
      cpu: "1"
# sinkd: replica counts, JVM options, cache sizes, and separate resource
# settings for the regular (`resources`) and raw (`rawResources`) workloads.
sinkd:
  replicas: 4
  rawReplicas: 4
  # Fixed 768M heap (Xms == Xmx) plus at most 100M of direct memory.
  jvmOpts: "-Xms768M -Xmx768M -XX:MaxDirectMemorySize=100M"
  entityCacheMaxSize: 350000
  timeseriesCacheMaxSize: 700000
  resources:
    requests:
      memory: "1152Mi"
      cpu: "250m"
    limits:
      memory: "1152Mi"
      cpu: "400m"
  rawResources:
    requests:
      memory: "1Gi"
      cpu: "250m"
    limits:
      memory: "1Gi"
      cpu: "400m"
# platformd: 2 replicas with Kubernetes resource requests and limits.
platformd:
  replicas: 2
  resources:
    requests:
      memory: "1536Mi"
      cpu: "1"
    limits:
      memory: "2Gi"
      cpu: "1500m"
# dpd: 3 replicas with a fixed-size JVM heap.
dpd:
  replicas: 3
  # Fixed 3584M heap (Xms == Xmx). The memory request below (3600Mi) is
  # only slightly larger; the limit is 4Gi.
  jvmOpts: "-Xms3584M -Xmx3584M"
  resources:
    requests:
      memory: "3600Mi"
      cpu: "6"
    limits:
      memory: "4Gi"
      cpu: "8"
# metricForecastd: Kubernetes resource requests and limits only.
metricForecastd:
  resources:
    requests:
      memory: "512Mi"
      cpu: "250m"
    limits:
      memory: "768Mi"
      cpu: "500m"
# Downsampled metrics stream: separate replica counts, RocksDB tuning and
# resources for the raw and bucketed workloads.
# NOTE(review): the file header lists x6 replicas each for the raw and
# bucketed disks, while this section sets replicas: 3 and
# bucketedReplicas: 4 - confirm which is intended.
downsampledMetricsStream:
  replicas: 3
  storageClass: "io1-25"
  bucketedReplicas: 4
  # Size annotations below assume the values are byte counts.
  rawRocksdb:
    totalOffHeapMemory: 268435456  # 256 MiB
    indexFilterRatio: 0.25
    totalMemTableMemory: 201326592  # 192 MiB
    blockSize: 32768  # 32 KiB
    writeBufferSize: 33554432  # 32 MiB
  bucketedRocksdb:
    totalOffHeapMemory: 33554432  # 32 MiB
    indexFilterRatio: 0.25
    totalMemTableMemory: 25165824  # 24 MiB
    blockSize: 16384  # 16 KiB
    writeBufferSize: 8388608  # 8 MiB
  resources:
    requests:
      memory: "3Gi"
      cpu: "1"
    limits:
      memory: "3Gi"
      cpu: "1500m"
  bucketedResources:
    requests:
      memory: "4Gi"
      cpu: "1500m"
    limits:
      memory: "4Gi"
      cpu: "2"
# Entity stream: `intermediate` and `final` stages, each with its own
# resource settings. Only `intermediate` sets a replica count.
entityStream:
  intermediate:
    replicas: 2
    resources:
      requests:
        memory: "1Gi"
        cpu: "1"
      limits:
        memory: "1536Mi"
        cpu: "1500m"
  final:
    resources:
      requests:
        memory: "512Mi"
        cpu: "1"
      limits:
        memory: "2500Mi"
        cpu: "1500m"
# etcd: 3 replicas.
etcd:
  replicas: 3
# Collection: resource requests and limits for the `metrics` collector.
collection:
  metrics:
    resources:
      requests:
        memory: "768Mi"
        cpu: "200m"
      limits:
        memory: "1Gi"
        cpu: "250m"
["Obcerv"]
["User Guide", "Technical Reference"]