×
Sample configuration for AWS EC2 handling 250k Obcerv entities and 100k metrics/sec (large) with NGINX Ingress controller
Download this sample configuration for AWS EC2 handling 250k Obcerv entities and 100k metrics/sec (large), provided by ITRS.
# Example Obcerv configuration for AWS EC2 handling 250k Obcerv entities, 100k metrics/sec, and 100k OpenTelemetry
# spans/sec (pre-sampling).
#
# Nodes:
# - (3) m5.8xlarge (32 CPU, 128GiB Memory) for Timescale
# - (6) c5.4xlarge (16 CPU, 32GiB Memory) for all other workloads
#
# The resource requests for Timescale total 48 cores and 360GiB memory.
# The resource requests for the other workloads total ~61 cores and ~142GiB memory.
# These totals include Linkerd resources.
#
# Disk requirements:
# - Timescale:
#   - 4 x 2 TiB timeseries data disk for each replica (x3)
#   - 200 GiB data disk for each replica (x3)
#   - 300 GiB WAL disk for each replica (x3)
# - Kafka: 400 GiB for each replica (x3)
# - Loki: 30 GiB for each replica (x1)
# - Zookeeper: 1 GiB for each replica (x3)
# - etcd: 1 GiB for each replica (x3)
# - Downsampled Metrics:
#   - Raw: 5 GiB for each replica (x4)
#   - Bucketed: 5 GiB for each replica (x3)
#
# The configuration references a default storage class named `gp3` which uses EBS gp3 volumes. This storage class should
# be configured with the default minimum gp3 settings of 3000 IOPS and 125 MiB/s throughput.
#
# The configuration also references a storage class named `gp3-timescale` which uses EBS gp3 volumes, but with
# higher provisioned performance for the WAL volumes. This storage class should be configured with 5000 IOPS and
# 200 MiB/s throughput.
#
# You can create these classes or change the config to use classes of your own, but they should be similar in performance.
#
# This configuration is based upon a certain number of Obcerv entities, average metrics per entity, and
# average metrics collection interval. The following formula can be used to estimate the expected load:
#
# metrics/sec = (Obcerv entities * metrics/entity) / average metrics collection interval
#
# In this example configuration, we have the following:
#
# 100,000 metrics/sec = (250,000 Obcerv entities * 4 metrics/entity) / 10 seconds average metrics collection interval
#
# NOTE: Ingestion, storage, and retrieval of OpenTelemetry spans is a beta feature.
#
# Additionally, the configuration is based upon a certain number of OpenTelemetry spans per second that are sampled
# based upon the following rules:
# - Error traces are always sampled
# - Target sampling probability per endpoint (corresponds to the name of the root span) is 0.01
# - Target sampling rate / second / endpoint (corresponds to the name of the root span) is 0.5
# - Root span duration outlier quantile is 0.95. The durations of all root spans are tracked and used to estimate
#   which spans are abnormally long
#
# Name of the default storage class; the header above expects an EBS gp3-backed class with this name.
defaultStorageClass: "gp3"
# External hostname and ingress annotations for the `apps` endpoint.
# This ingress is declared as the mergeable-ingress "master".
apps:
  externalHostname: "obcerv.mydomain.internal"
  ingress:
    annotations:
      kubernetes.io/ingress.class: "nginx"
      nginx.org/mergeable-ingress-type: "master"
# Ingestion endpoint: external hostname, replica count, ingress annotations and container resources.
ingestion:
  externalHostname: "obcerv-ingestion.mydomain.internal"
  replicas: 3
  ingress:
    annotations:
      kubernetes.io/ingress.class: "nginx"
      # NOTE(review): the `nginx.ingress.kubernetes.io/*` prefix is the community ingress-nginx family, while the
      # `apps` and `iam` ingresses use `nginx.org/*` annotations — confirm which NGINX controller is deployed.
      nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
  resources:
    requests:
      memory: "512Mi"
      cpu: "500m"
    limits:
      memory: "768Mi"
      cpu: "1"
  # NOTE: OpenTelemetry Traces ingestion is a beta feature and resources may need to be adjusted based on ingestion rate.
  # NOTE(review): `traces` is assumed to nest under `ingestion` — confirm against the chart's values schema.
  traces:
    jvmOpts: "-Xms3G -Xmx4G"
    resources:
      requests:
        memory: "4Gi"
        cpu: "3"
      limits:
        memory: "5Gi"
        cpu: "4"
# Linkerd proxy CPU/memory request and limit annotations.
# NOTE(review): `mesh` is assumed to be a top-level key; the flattened source does not show whether it
# nests under `ingestion` instead — confirm against the chart's values schema.
mesh:
  resourcesHot:
    config.linkerd.io/proxy-cpu-request: "80m"
    config.linkerd.io/proxy-cpu-limit: "1"
    config.linkerd.io/proxy-memory-request: "48Mi"
    config.linkerd.io/proxy-memory-limit: "192Mi"
# Ingress annotations for `iam`. Declared as a mergeable-ingress "minion"; the `apps` ingress is the "master".
iam:
  ingress:
    annotations:
      kubernetes.io/ingress.class: "nginx"
      nginx.org/mergeable-ingress-type: "minion"
# Kafka: 3 replicas with a 400Gi disk each (matches the disk requirements in the header).
# Memory request equals the memory limit.
kafka:
  replicas: 3
  diskSize: "400Gi"
  resources:
    requests:
      memory: "12Gi"
      cpu: "3"
    limits:
      memory: "12Gi"
      cpu: "4"
# Timescale: 3-node cluster pinned to dedicated nodes, with per-table chunk size and retention.
timescale:
  clusterSize: 3
  dataDiskSize: "200Gi"
  # 4 x 2Ti timeseries disks per replica, as listed in the header's disk requirements.
  timeseriesDiskCount: 4
  timeseriesDiskSize: "2Ti"
  walDiskSize: "300Gi"
  # WAL volumes use the higher-performance class described in the header (5000 IOPS, 200 MiB/s).
  walStorageClass: "gp3-timescale"
  # Requests equal limits for both memory and CPU.
  resources:
    requests:
      memory: "116Gi"
      cpu: "16"
    limits:
      memory: "116Gi"
      cpu: "16"
  # Schedule only onto the Timescale node group and tolerate its `dedicated` taint.
  nodeSelector:
    instancegroup: timescale-nodes
  tolerations:
    - key: dedicated
      operator: Equal
      value: timescale-nodes
      effect: NoSchedule
  # NOTE(review): `retention` is assumed to nest under `timescale` — confirm against the chart's values schema.
  retention:
    entity_attributes:
      chunkSize: 2d
      retention: 1y
    metrics:
      chunkSize: 8h
      retention: 30d
    metrics_5m:
      chunkSize: 1d
      retention: 90d
    metrics_1h:
      chunkSize: 5d
      retention: 180d
    metrics_1d:
      chunkSize: 20d
      retention: 1y
    statuses:
      chunkSize: 7d
      retention: 1y
    signal_details:
      chunkSize: 1d
      retention: 30d
    traces:
      chunkSize: 4h
      retention: 5d
    # The retention for the traces table applies to this table.
    span_links:
      chunkSize: 1d
# Loki: 30Gi disk (matches the header), ingestion rate/burst limits and container resources.
loki:
  diskSize: "30Gi"
  ingestionBurstSize: 12
  ingestionRateLimit: 8
  resources:
    requests:
      memory: "1Gi"
      cpu: "500m"
    limits:
      memory: "1Gi"
      cpu: "1"
# sinkd: replica counts, JVM options, cache sizes, per-stream Kafka consumer properties and
# container resources. The `raw*` keys configure the raw variant separately from the default one.
sinkd:
  replicas: 3
  rawReplicas: 6
  jvmOpts: "-Xms1536M -Xmx1536M -XX:MaxDirectMemorySize=100M"
  rawJvmOpts: "-Xms1G -Xmx1G -XX:MaxDirectMemorySize=100M"
  timeseriesCacheMaxSize: 700000
  evCacheMaxSize: 2000000
  # Kafka consumer overrides, one group per consumed stream. These groups sit under `sinkd`:
  # `loki` and `metrics` would otherwise collide with keys of the same name elsewhere in the file.
  metrics:
    consumerProperties:
      receive.buffer.bytes: 131072
      fetch.max.bytes: 5242880
      max.partition.fetch.bytes: 524288
      max.poll.records: 10000
  loki:
    consumerProperties:
      fetch.max.bytes: 52428800
      max.partition.fetch.bytes: 1048576
      receive.buffer.bytes: 65536
      max.poll.records: 5000
  dsMetrics:
    consumerProperties:
      fetch.max.bytes: 52428800
      max.partition.fetch.bytes: 1048576
      receive.buffer.bytes: 65536
  entities:
    consumerProperties:
      fetch.max.bytes: 52428800
      max.partition.fetch.bytes: 1048576
      receive.buffer.bytes: 65536
  signals:
    consumerProperties:
      fetch.max.bytes: 52428800
      max.partition.fetch.bytes: 1048576
      receive.buffer.bytes: 65536
  resources:
    requests:
      memory: "2Gi"
      cpu: "250m"
    limits:
      memory: "2Gi"
      cpu: "1"
  rawResources:
    requests:
      memory: "1536Mi"
      cpu: "500m"
    limits:
      memory: "1536Mi"
      cpu: "1"
# platformd: 2 replicas with the container resources below.
platformd:
  replicas: 2
  resources:
    requests:
      memory: "1536Mi"
      cpu: "1"
    limits:
      memory: "2Gi"
      cpu: "1500m"
# dpd: replicas, JVM heap, caches, Kafka consumer properties, multiplexer tuning,
# self-monitoring lag thresholds and container resources.
dpd:
  replicas: 3
  jvmOpts: "-Xmx5G"
  maxEntitySerdeCacheEntries: 75000
  consumerProperties:
    max.poll.records: 10000
    fetch.min.bytes: 524288
  entitiesInMemoryCacheSizeMb: 256
  # NOTE(review): all three keys below are assumed to belong to `metricsMultiplexer` — confirm
  # against the chart's values schema.
  metricsMultiplexer:
    maxFilterResultCacheSize: 1000000
    maxConcurrentOps: 1000
    localParallelism: 8
  # Partition-lag thresholds at which self-monitoring reports warn / critical.
  selfMonitoringThresholds:
    entities_partition_lag_warn: 100000
    entities_partition_lag_critical: 500000
    metrics_partition_lag_warn: 1000000
    metrics_partition_lag_critical: 5000000
  resources:
    requests:
      memory: "6Gi"
      cpu: "2"
    limits:
      memory: "6500Mi"
      cpu: "3"
# metricForecastd: container resources only; all other settings are left at chart defaults.
metricForecastd:
  resources:
    requests:
      memory: "512Mi"
      cpu: "250m"
    limits:
      memory: "768Mi"
      cpu: "500m"
# Downsampled metrics stream. Unprefixed keys configure the raw variant and `bucketed*` keys the
# bucketed variant; each has its own replicas, JVM options, Kafka consumer properties, resources
# and RocksDB sizing.
downsampledMetricsStream:
  replicas: 4
  # NOTE(review): the header's disk requirements list "Bucketed: 5 GiB for each replica (x3)",
  # but 4 bucketed replicas are configured here — confirm which value is intended.
  bucketedReplicas: 4
  jvmOpts: "-XX:InitialRAMPercentage=25 -XX:MaxRAMPercentage=25"
  bucketedJvmOpts: "-XX:InitialRAMPercentage=50 -XX:MaxRAMPercentage=50"
  consumerProperties:
    fetch.max.bytes: 52428800
    fetch.min.bytes: 524288
    max.partition.fetch.bytes: 1048576
    max.poll.records: 10000
    receive.buffer.bytes: 65536
  resources:
    requests:
      memory: "3Gi"
      cpu: "1500m"
    limits:
      memory: "3Gi"
      cpu: "2"
  bucketedConsumerProperties:
    fetch.max.bytes: 52428800
    fetch.min.bytes: 524288
    max.partition.fetch.bytes: 1048576
    max.poll.records: 10000
    receive.buffer.bytes: 65536
  bucketedResources:
    requests:
      memory: "3Gi"
      cpu: "1"
    limits:
      memory: "3Gi"
      cpu: "3"
  rocksdb:
    raw:
      indexAndFilterRatio: 0.5
      memoryMib: 500
      writeBufferMib: 16
      writeBufferRatio: 0.25
    bucketed:
      indexAndFilterRatio: 0.5
      memoryMib: 200
      writeBufferMib: 16
      writeBufferRatio: 0.25
# Entity stream, configured as two stages (`intermediate` and `final`), each with its own
# replicas, cache size, Kafka consumer properties and container resources.
entityStream:
  intermediate:
    storedEntitiesCacheSize: 10000
    jvmOpts: "-XX:MaxRAMPercentage=25"
    replicas: 4
    consumerProperties:
      fetch.max.bytes: 52428800
      max.partition.fetch.bytes: 1048576
      receive.buffer.bytes: 65536
    resources:
      requests:
        memory: "1536Mi"
        cpu: "750m"
      limits:
        memory: "2Gi"
        cpu: "1"
    rocksdb:
      memoryMib: 300
  final:
    replicas: 6
    storedEntitiesCacheSize: 10000
    consumerProperties:
      fetch.max.bytes: 52428800
      max.partition.fetch.bytes: 1048576
      receive.buffer.bytes: 65536
    resources:
      requests:
        memory: "1536Mi"
        cpu: "1"
      limits:
        memory: "2560Mi"
        cpu: "1500m"
# Signals stream: Kafka consumer properties and container resources.
signalsStream:
  consumerProperties:
    fetch.max.bytes: 52428800
    max.partition.fetch.bytes: 1048576
    receive.buffer.bytes: 65536
  resources:
    requests:
      memory: "768Mi"
      cpu: "150m"
    limits:
      memory: "1536Mi"
      cpu: "300m"
# etcd: 3 replicas (the header lists a 1 GiB disk for each).
etcd:
  replicas: 3
# Collection: tolerations for the daemon set, plus resources for the metrics collector.
collection:
  daemonSet:
    tolerations:
      # must match the tainted Timescale nodes setting
      - key: dedicated
        operator: Equal
        value: timescale-nodes
        effect: NoSchedule
  # NOTE(review): `metrics` is assumed to be a sibling of `daemonSet` under `collection` — confirm
  # against the chart's values schema.
  metrics:
    resources:
      requests:
        memory: "768Mi"
        cpu: "200m"
      limits:
        memory: "1Gi"
        cpu: "500m"
["Obcerv"]
["User Guide", "Technical Reference"]