Sample configuration for AWS EC2 handling 250k Obcerv entities and 100k metrics/sec (large) with NGINX Ingress controller

Download this sample configuration, provided by ITRS, for AWS EC2 handling 250k Obcerv entities and 100k metrics/sec (large).

# Example Obcerv configuration for AWS EC2 handling 250,000 Obcerv entities and 100k metrics/sec.
#
# Nodes:
# - (3) m5.8xlarge (32 CPU, 128GiB Memory) for Timescale
# - (6) c5.4xlarge (16 CPU, 32GiB Memory) for all other workloads
#
# The resource requests for Timescale total 48 cores and 348GiB memory (3 replicas x 16 CPU / 116Gi).
# The resource requests for the other workloads total ~58 cores and ~138GiB memory.
# These totals include Linkerd resources.
#
# Disk requirements:
# - Timescale:
#   - 4 x 2 TiB timeseries data disk for each replica (x3)
#   - 200 GiB data disk for each replica (x3)
#   - 300 GiB WAL disk for each replica (x3)
# - Kafka: 400 GiB for each replica (x3)
# - Loki: 30 GiB for each replica (x1)
# - Zookeeper: 1 GiB for each replica (x3)
# - etcd: 1 GiB for each replica (x3)
# - Downsampled Metrics:
#   - Raw: 5 GiB for each replica (x4)
#   - Bucketed: 5 GiB for each replica (x4)
#
# The configuration references a default storage class named `gp3` which uses EBS gp3 volumes. This storage class should
# be configured with the default minimum gp3 settings of 3000 IOPS and 125 MiB/s throughput.
#
# The configuration also references a storage class named `gp3-timescale` which uses EBS gp3 volumes, but with
# higher provisioned performance for the WAL volumes. This storage class should be configured with 5000 IOPS and
# 200 MiB/s throughput.
#
# You can create these classes or change the config to use classes of your own, but they should be similar in performance.
#
# This configuration is based upon a certain number of Obcerv entities, average metrics per entity, and
# average metrics collection interval. The following formula can be used to estimate the load to expect:
#
# metrics/sec = (Obcerv entities * metrics/entity) / average metrics collection interval
#
# In this example configuration, we have the following:
#
# 100,000 metrics/sec = (250,000 Obcerv entities * 4 metrics/entity) / 10 seconds average metrics collection interval
#

# Default storage class (EBS gp3); see the header notes for the expected IOPS and throughput settings.
defaultStorageClass: "gp3"
# Apps endpoint, exposed through the "nginx" ingress class.
apps:
  ingress:
    annotations:
      # Declares this Ingress as the "master" of a mergeable Ingress.
      nginx.org/mergeable-ingress-type: "master"
      kubernetes.io/ingress.class: "nginx"
  # Example hostname; presumably replaced per deployment.
  externalHostname: "obcerv.mydomain.internal"
# Ingestion endpoint: 3 replicas behind an ingress that uses the GRPC backend protocol.
ingestion:
  replicas: 3
  # Each replica requests 500m CPU / 512Mi and is capped at 1 CPU / 768Mi.
  resources:
    requests:
      cpu: "500m"
      memory: "512Mi"
    limits:
      cpu: "1"
      memory: "768Mi"
  # Example hostname; presumably replaced per deployment.
  externalHostname: "obcerv-ingestion.mydomain.internal"
  ingress:
    annotations:
      kubernetes.io/ingress.class: "nginx"
      # NOTE(review): this annotation uses the community ingress-nginx prefix
      # (nginx.ingress.kubernetes.io/), while apps/iam in this file use nginx.org/
      # annotations, which belong to the F5 NGINX Ingress Controller. A controller
      # typically ignores annotations with the other prefix - confirm which one is deployed.
      nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
# Linkerd proxy CPU/memory settings, keyed by config.linkerd.io annotation names.
# NOTE(review): which workloads count as "hot" is not shown in this file.
mesh:
  resourcesHot:
    # Requests
    config.linkerd.io/proxy-cpu-request: "80m"
    config.linkerd.io/proxy-memory-request: "48Mi"
    # Limits
    config.linkerd.io/proxy-cpu-limit: "1"
    config.linkerd.io/proxy-memory-limit: "192Mi"
# IAM ingress, exposed through the "nginx" ingress class.
iam:
  ingress:
    annotations:
      # "minion" half of a mergeable Ingress; presumably merged under the apps "master" Ingress.
      nginx.org/mergeable-ingress-type: "minion"
      kubernetes.io/ingress.class: "nginx"
# ZooKeeper: 3 replicas. No disk size is set here (the header lists 1 GiB per replica).
zookeeper:
  replicas: 3
  resources:
    # CPU limit equals the request (200m); memory may grow from 256Mi to 512Mi.
    requests:
      cpu: "200m"
      memory: "256Mi"
    limits:
      cpu: "200m"
      memory: "512Mi"
# Kafka: 3 replicas with a 400Gi disk each and 24 default partitions.
kafka:
  replicas: 3
  defaultPartitions: 24
  diskSize: "400Gi"
  # Memory is fixed at 12Gi (request == limit); CPU may burst from 3 to 4 cores.
  resources:
    requests:
      cpu: "3"
      memory: "12Gi"
    limits:
      cpu: "4"
      memory: "12Gi"
  # Presumably maps to the Kafka consumer settings fetch.min.bytes / fetch.max.wait.ms - confirm.
  consumer:
    # 524288 bytes = 512 KiB
    fetchMinBytes: 524288
    fetchMaxWaitMs: 250
# Timescale: cluster of 3, each member requesting and limited to 16 CPU / 116Gi, placed on dedicated nodes.
timescale:
  clusterSize: 3
  resources:
    requests:
      cpu: "16"
      memory: "116Gi"
    limits:
      cpu: "16"
      memory: "116Gi"
  # Placement: select nodes labelled instancegroup=timescale-nodes and tolerate the
  # dedicated=timescale-nodes:NoSchedule taint.
  nodeSelector:
    instancegroup: timescale-nodes
  tolerations:
  - key: dedicated
    operator: Equal
    value: timescale-nodes
    effect: NoSchedule
  # Disks: a data disk, a WAL disk, and 4 timeseries disks of 2Ti each.
  dataDiskSize: "200Gi"
  walDiskSize: "300Gi"
  # WAL volumes use the higher-performance storage class described in the file header.
  walStorageClass: "gp3-timescale"
  timeseriesDiskCount: 4
  timeseriesDiskSize: "2Ti"
  # Retention period and chunk size per data set (key names presumably correspond to Timescale tables).
  retention:
    entity_attributes:
      retention: "1y"
      chunkSize: "2d"
    metrics:
      retention: "30d"
      chunkSize: "8h"
    metrics_5m:
      retention: "90d"
      chunkSize: "1d"
    metrics_1h:
      retention: "180d"
      chunkSize: "5d"
    metrics_1d:
      retention: "1y"
      chunkSize: "20d"
    statuses:
      retention: "1y"
      chunkSize: "7d"
    signal_details:
      retention: "30d"
      chunkSize: "1d"
# Loki: 30Gi disk, memory fixed at 1Gi (request == limit), CPU may burst from 500m to 1 core.
loki:
  diskSize: "30Gi"
  resources:
    requests:
      cpu: "500m"
      memory: "1Gi"
    limits:
      cpu: "1"
      memory: "1Gi"
  # Ingestion limits; units are not stated in this file - confirm against the chart documentation.
  ingestionRateLimit: 8
  ingestionBurstSize: 12
  # 8388608 bytes = 8 MiB
  maxPayloadSize: 8388608
# sinkd: two sets of workers, configured through the plain keys and the raw* keys respectively.
sinkd:
  # Standard workers: 3 replicas, 1536M heap inside a 2Gi container.
  replicas: 3
  jvmOpts: "-Xms1536M -Xmx1536M -XX:MaxDirectMemorySize=100M"
  resources:
    requests:
      cpu: "250m"
      memory: "2Gi"
    limits:
      cpu: "1"
      memory: "2Gi"
  # Raw workers: 6 replicas, 1G heap inside a 1536Mi container.
  rawReplicas: 6
  rawJvmOpts: "-Xms1G -Xmx1G -XX:MaxDirectMemorySize=100M"
  rawResources:
    requests:
      cpu: "500m"
      memory: "1536Mi"
    limits:
      cpu: "1"
      memory: "1536Mi"
  # Cache capacities.
  entityCacheMaxSize: 350000
  timeseriesCacheMaxSize: 700000
  evCacheMaxSize: 2000000
  # Maximum records per poll, per stream.
  metrics:
    maxPollRecords: 10000
  loki:
    maxPollRecords: 5000
# platformd: 2 replicas, each requesting 1 CPU / 1536Mi and capped at 1500m CPU / 2Gi.
platformd:
  replicas: 2
  resources:
    requests:
      cpu: "1"
      memory: "1536Mi"
    limits:
      cpu: "1500m"
      memory: "2Gi"
# dpd: 3 replicas with a 5G maximum heap inside a 6Gi request / 6500Mi limit.
dpd:
  replicas: 3
  jvmOpts: "-Xmx5G"
  resources:
    requests:
      cpu: "2"
      memory: "6Gi"
    limits:
      cpu: "3"
      memory: "6500Mi"
  # Consumer batch size and cache sizing.
  kafkaConsumerMaxPollRecords: 10000
  maxEntitySerdeCacheEntries: 75000
  entitiesInMemoryCacheSizeMb: 256
  metricsMultiplexer:
    localParallelism: 8
    maxConcurrentOps: 1000
    maxFilterResultCacheSize: 1000000
  # Partition lag thresholds; critical is 5x the warning level for both entities and metrics.
  selfMonitoringThresholds:
    entities_partition_lag_warn: 100000
    entities_partition_lag_critical: 500000
    metrics_partition_lag_warn: 1000000
    metrics_partition_lag_critical: 5000000
# metricForecastd: requests 250m CPU / 512Mi, capped at 500m CPU / 768Mi. No replica count is set here.
metricForecastd:
  resources:
    requests:
      cpu: "250m"
      memory: "512Mi"
    limits:
      cpu: "500m"
      memory: "768Mi"
# Downsampled metrics stream: separate raw and bucketed workers, each with memory fixed at 3Gi.
downsampledMetricsStream:
  maxPollRecords: 10000
  # Raw workers: 4 replicas; JVM RAM percentage flags set to 25.
  replicas: 4
  jvmOpts: "-XX:InitialRAMPercentage=25 -XX:MaxRAMPercentage=25"
  resources:
    requests:
      cpu: "1500m"
      memory: "3Gi"
    limits:
      cpu: "2"
      memory: "3Gi"
  # Bucketed workers: 4 replicas; JVM RAM percentage flags set to 50.
  bucketedReplicas: 4
  bucketedJvmOpts: "-XX:InitialRAMPercentage=50 -XX:MaxRAMPercentage=50"
  bucketedResources:
    requests:
      cpu: "1"
      memory: "3Gi"
    limits:
      cpu: "3"
      memory: "3Gi"
  # RocksDB sizing; raw and bucketed differ only in memoryMib (500 vs 200).
  rocksdb:
    raw:
      memoryMib: 500
      indexAndFilterRatio: 0.5
      writeBufferMib: 16
      writeBufferRatio: 0.25
    bucketed:
      memoryMib: 200
      indexAndFilterRatio: 0.5
      writeBufferMib: 16
      writeBufferRatio: 0.25
# Entity stream: an intermediate stage (4 replicas) and a final stage (6 replicas).
entityStream:
  intermediate:
    replicas: 4
    jvmOpts: "-XX:MaxRAMPercentage=25"
    storedEntitiesCacheSize: 10000
    rocksdb:
      memoryMib: 300
    resources:
      requests:
        cpu: "750m"
        memory: "1536Mi"
      limits:
        cpu: "1"
        memory: "2Gi"
  # The final stage sets no jvmOpts or rocksdb values here.
  final:
    replicas: 6
    storedEntitiesCacheSize: 10000
    resources:
      requests:
        cpu: "1"
        memory: "1536Mi"
      limits:
        cpu: "1500m"
        memory: "2560Mi"
# signalsStream: limits are twice the requests (150m -> 300m CPU, 768Mi -> 1536Mi memory).
signalsStream:
  resources:
    requests:
      cpu: "150m"
      memory: "768Mi"
    limits:
      cpu: "300m"
      memory: "1536Mi"
# etcd: 3 replicas. No disk size is set here (the header lists 1 GiB per replica).
etcd:
  replicas: 3
# Collection: resources for the metrics collector, plus tolerations for the collection DaemonSet.
collection:
  metrics:
    resources:
      requests:
        cpu: "200m"
        memory: "768Mi"
      limits:
        cpu: "500m"
        memory: "1Gi"
  daemonSet:
    # Tolerates the dedicated=timescale-nodes:NoSchedule taint so DaemonSet pods may be
    # scheduled onto the Timescale nodes as well.
    tolerations:
    # must match the tainted Timescale nodes setting
    - key: dedicated
      operator: Equal
      value: timescale-nodes
      effect: NoSchedule
["Obcerv"] ["User Guide", "Technical Reference"]

Was this topic helpful?