<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://all.docs.genesys.com/index.php?action=history&amp;feed=atom&amp;title=PEC-REP%2FCurrent%2FGIMPEGuide%2FGSPMetrics</id>
	<title>PEC-REP/Current/GIMPEGuide/GSPMetrics - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://all.docs.genesys.com/index.php?action=history&amp;feed=atom&amp;title=PEC-REP%2FCurrent%2FGIMPEGuide%2FGSPMetrics"/>
	<link rel="alternate" type="text/html" href="https://all.docs.genesys.com/index.php?title=PEC-REP/Current/GIMPEGuide/GSPMetrics&amp;action=history"/>
	<updated>2026-05-15T08:15:51Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.31.1</generator>
	<entry>
		<id>https://all.docs.genesys.com/index.php?title=PEC-REP/Current/GIMPEGuide/GSPMetrics&amp;diff=121455&amp;oldid=prev</id>
		<title>WikiSysop at 14:30, June 6, 2022</title>
		<link rel="alternate" type="text/html" href="https://all.docs.genesys.com/index.php?title=PEC-REP/Current/GIMPEGuide/GSPMetrics&amp;diff=121455&amp;oldid=prev"/>
		<updated>2022-06-06T14:30:46Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;table class=&quot;diff diff-contentalign-left&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #222; text-align: center;&quot;&gt;Revision as of 14:30, June 6, 2022&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l165&quot; &gt;Line 165:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 165:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|AlertsDefined=Yes&lt;/div&gt;&lt;/td&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|AlertsDefined=Yes&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|AlertsIntro=The alerts are based on Flink and Kubernetes cluster metrics.&lt;/div&gt;&lt;/td&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|AlertsIntro=The alerts are based on Flink and Kubernetes cluster metrics.&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class='diff-marker'&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td colspan=&quot;2&quot;&gt;&amp;#160;&lt;/td&gt;&lt;td class='diff-marker'&gt;+&lt;/td&gt;&lt;td style=&quot;color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #a3d3ff; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;ins style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;{{JDNote|Location where Dev defines the GSP alarms: [https://github.com/genesysengage/sre-monitoring-provisioning/blob/master/prometheus/rules/gsp/azure-gsp.yaml https://github.com/genesysengage/sre-monitoring-provisioning/blob/master/prometheus/rules/gsp/azure-gsp.yaml].}}&lt;/ins&gt;&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|PEAlert={{PEAlert&lt;/div&gt;&lt;/td&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|PEAlert={{PEAlert&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|Alert=GspFlinkJobDown&lt;/div&gt;&lt;/td&gt;&lt;td class='diff-marker'&gt;&amp;#160;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #222; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;|Alert=GspFlinkJobDown&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;/table&gt;</summary>
		<author><name>WikiSysop</name></author>
		
	</entry>
	<entry>
		<id>https://all.docs.genesys.com/index.php?title=PEC-REP/Current/GIMPEGuide/GSPMetrics&amp;diff=114353&amp;oldid=prev</id>
		<title>Jose.druker@genesys.com: Published</title>
		<link rel="alternate" type="text/html" href="https://all.docs.genesys.com/index.php?title=PEC-REP/Current/GIMPEGuide/GSPMetrics&amp;diff=114353&amp;oldid=prev"/>
		<updated>2021-12-16T18:12:54Z</updated>

		<summary type="html">&lt;p&gt;Published&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;{{ArticlePEServiceMetrics&lt;br /&gt;
|IncludedServiceId=c39fe496-c79e-4846-b451-1bc8bedb126b&lt;br /&gt;
|CRD=PodMonitor&lt;br /&gt;
|Port=9249&lt;br /&gt;
|Endpoint=Endpoint: /&lt;br /&gt;
&lt;br /&gt;
Selector:&lt;br /&gt;
&amp;lt;source lang=&amp;quot;bash&amp;quot;&amp;gt;matchLabels:&lt;br /&gt;
  app: {{ template &amp;quot;gsp.fullname&amp;quot; . }}&amp;lt;/source&amp;gt;&lt;br /&gt;
where the value of &amp;lt;tt&amp;gt;gsp.fullname&amp;lt;/tt&amp;gt; depends on deployment parameters such as Helm release name, &amp;lt;tt&amp;gt;.Values.fullnameOverride&amp;lt;/tt&amp;gt;, and &amp;lt;tt&amp;gt;.Values.nameOverride&amp;lt;/tt&amp;gt;.&lt;br /&gt;
|MetricsUpdateInterval=30 seconds&lt;br /&gt;
|MetricsDefined=Yes&lt;br /&gt;
|MetricsIntro=GSP exposes some standard Apache Flink and Kafka metrics as well as Genesys-defined metrics, which are exposed via the Flink API. Therefore, all GSP metrics start with the prefix '''flink_''' but in some cases the values are calculated by GSP.&lt;br /&gt;
&lt;br /&gt;
You can query Prometheus directly to see all the metrics Flink and the Flink Kafka connector expose through GSP.&lt;br /&gt;
*For full information about the standard Flink metrics, see the [https://ci.apache.org/projects/flink/flink-docs-release-1.13/docs/ops/metrics/#system-metrics Apache Flink documentation].&lt;br /&gt;
*For full information about the Kafka metrics, see the [https://kafka.apache.org/20/documentation.html#monitoring Apache Kafka] or [https://docs.confluent.io/platform/current/kafka/monitoring.html Confluent Kafka] documentation.&lt;br /&gt;
&lt;br /&gt;
The following metrics are likely to be particularly useful. The naming convention is '''&amp;lt;flink_scope_prefix&amp;gt;_&amp;lt;GSP suffix&amp;gt;'''. Genesys does not commit to maintaining other currently available GSP metrics that are not documented on this page.&lt;br /&gt;
|PEMetric={{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_errors_numInvalidRecords&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|MetricDescription=Number of invalid input records.&lt;br /&gt;
|SampleValue=0&lt;br /&gt;
|UsedFor=Error&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_jobmanager_numRunningJobs&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|MetricDescription=Number of running Flink jobs. If less than 1, there is a problem.&lt;br /&gt;
|SampleValue=1&lt;br /&gt;
|UsedFor=Error&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_user_errors_numOversizedMessages&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*operator_name&lt;br /&gt;
|MetricDescription=Number of messages exceeding the '''max.request.size''' Kafka option.&lt;br /&gt;
|SampleValue=0&lt;br /&gt;
|UsedFor=Error&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_tenant_error_total&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*operator_name&lt;br /&gt;
*tenant&lt;br /&gt;
*error&lt;br /&gt;
|MetricDescription=Number of issues encountered, such as errors or warnings.&lt;br /&gt;
|UsedFor=Error&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_currentInputWatermark&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Unit=milliseconds&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*operator_name&lt;br /&gt;
|MetricDescription=The last watermark received by this operator/task, in milliseconds since the Unix Epoch (00:00:00 UTC on 1 January 1970). &amp;lt;br /&amp;gt; &lt;br /&gt;
'''Note:''' For operators/tasks with two inputs, this is the earlier of the last received watermarks.&lt;br /&gt;
|UsedFor=Latency&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_currentOutputWatermark&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Unit=milliseconds&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*operator_name:&lt;br /&gt;
**Sink:_Agent_State_Facts&lt;br /&gt;
**Sink:_Interaction_Facts&lt;br /&gt;
|MetricDescription=The last watermark this operator has emitted, in milliseconds since the Unix Epoch.&lt;br /&gt;
|UsedFor=Latency&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_records_lag_max&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|MetricDescription=The maximum lag in terms of the number of records for any partition in this window. An increasing value over time is your best indication that the consumer group is not keeping up with the producers.&lt;br /&gt;
|UsedFor=Latency&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_records_consumed_rate&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|MetricDescription=The average number of records consumed per second.&lt;br /&gt;
|UsedFor=Traffic&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_numCallsCreated&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|MetricDescription=Total number of EventCallCreated events GSP received since it started processing.&lt;br /&gt;
|UsedFor=Traffic&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_numCallsCreatedPerSecond&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|MetricDescription=Number of EventCallCreated events per second (CPS).&lt;br /&gt;
|UsedFor=Traffic&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_numThreadsCreated&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|MetricDescription=Total number of CallThreads GSP received since it started processing.&lt;br /&gt;
|UsedFor=Traffic&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_numCallThreadsCreatedPerSecond&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|MetricDescription=Number of CallThreads per second (CTHPS).&lt;br /&gt;
|UsedFor=Traffic&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_numChainsProcessed&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|MetricDescription=Total number of EventOCSChainStartProcessing events GSP received since it started processing.&lt;br /&gt;
|UsedFor=Traffic&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_taskmanager_job_task_operator_numChainsProcessedPerSecond&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|MetricDescription=Number of EventOCSChainStartProcessing events per second (CPS).&lt;br /&gt;
|UsedFor=Traffic&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_(job&amp;amp;#124;task)manager_Status_JVM_CPU_Load&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*pod&lt;br /&gt;
|MetricDescription=The recent CPU usage for the JVM process. The value is a double in the &amp;lt;nowiki&amp;gt;[0.0,1.0]&amp;lt;/nowiki&amp;gt; interval, where a value of 0.0 means that none of the CPUs were running threads from the JVM process, while a value of 1.0 means that all CPUs were actively running threads from the JVM 100% of the time during the recent period being observed. A negative value means usage data is not available. For more information, see [https://docs.oracle.com/javase/7/docs/jre/api/management/extension/com/sun/management/OperatingSystemMXBean.html#getProcessCpuLoad() {{#replace:https://docs.oracle.com/javase/7/docs/jre/api/management/extension/com/sun/management/OperatingSystemMXBean.html#getProcessCpuLoad()|/|/&amp;lt;wbr/&amp;gt;}}].&lt;br /&gt;
|SampleValue=&lt;br /&gt;
|UsedFor=Saturation&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_(job&amp;amp;#124;task)manager_Status_JVM_Memory_Direct_TotalCapacity&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Unit=bytes&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*pod&lt;br /&gt;
|MetricDescription=The total capacity of all buffers in the direct buffer pool.&lt;br /&gt;
|UsedFor=Saturation&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_(job&amp;amp;#124;task)manager_Status_JVM_Memory_Direct_MemoryUsed&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Unit=bytes&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*pod&lt;br /&gt;
|MetricDescription=The amount of memory used by the JVM for the direct buffer pool.&lt;br /&gt;
|UsedFor=Saturation&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_(job&amp;amp;#124;task)manager_Status_JVM_Memory_NonHeap_Max&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Unit=bytes&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*pod&lt;br /&gt;
|MetricDescription=The maximum amount of non-heap memory that can be used for memory management.&lt;br /&gt;
|UsedFor=Saturation&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_(job&amp;amp;#124;task)manager_Status_JVM_Memory_NonHeap_Used&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Unit=bytes&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*pod&lt;br /&gt;
|MetricDescription=The amount of non-heap memory currently used.&lt;br /&gt;
|UsedFor=Saturation&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_(job&amp;amp;#124;task)manager_Status_JVM_Memory_Heap_Max&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Unit=bytes&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*pod&lt;br /&gt;
|MetricDescription=The maximum amount of heap memory that can be used for memory management.&lt;br /&gt;
|UsedFor=Saturation&lt;br /&gt;
}}{{PEMetric&lt;br /&gt;
|Metric=flink_(job&amp;amp;#124;task)manager_Status_JVM_Memory_Heap_Used&lt;br /&gt;
|Type=Gauge&lt;br /&gt;
|Unit=bytes&lt;br /&gt;
|Label=&amp;lt;br /&amp;gt;&lt;br /&gt;
*pod&lt;br /&gt;
|MetricDescription=The amount of heap memory currently used.&lt;br /&gt;
|UsedFor=Saturation&lt;br /&gt;
}}&lt;br /&gt;
|AlertsDefined=Yes&lt;br /&gt;
|AlertsIntro=The alerts are based on Flink and Kubernetes cluster metrics.&lt;br /&gt;
|PEAlert={{PEAlert&lt;br /&gt;
|Alert=GspFlinkJobDown&lt;br /&gt;
|Severity=Critical&lt;br /&gt;
|AlertDescription=Triggered when the GSP Flink job is not running (the number of running jobs equals 0 or the metric is not available)&lt;br /&gt;
|BasedOn=flink_jobmanager_numRunningJobs&lt;br /&gt;
|Threshold=For 5 minutes&lt;br /&gt;
}}{{PEAlert&lt;br /&gt;
|Alert=GspOOMKilled&lt;br /&gt;
|Severity=Critical&lt;br /&gt;
|AlertDescription=Triggered when a GSP pod is restarted because of OOMKilled&lt;br /&gt;
|BasedOn=kube_pod_container_status_restarts_total&lt;br /&gt;
|Threshold=0&lt;br /&gt;
}}{{PEAlert&lt;br /&gt;
|Alert=GspNoTmRegistered&lt;br /&gt;
|Severity=Critical&lt;br /&gt;
|AlertDescription=Triggered when there are no registered TaskManagers (or the metric is not available)&lt;br /&gt;
|BasedOn=flink_jobmanager_numRegisteredTaskManagers&lt;br /&gt;
|Threshold=For 5 minutes&lt;br /&gt;
}}{{PEAlert&lt;br /&gt;
|Alert=GspUnknownPerson&lt;br /&gt;
|Severity=High&lt;br /&gt;
|AlertDescription=Triggered when GSP encounters unknown person(s)&lt;br /&gt;
|BasedOn={{#replace:flink_taskmanager_job_task_operator_tenant_error_total{error=&amp;quot;unknown_person&amp;quot;,service=&amp;quot;gsp&amp;quot;}|_|_&amp;lt;wbr&amp;gt;}}&lt;br /&gt;
|Threshold=For 5 minutes&lt;br /&gt;
}}&lt;br /&gt;
}}&lt;/div&gt;</summary>
		<author><name>Jose.druker@genesys.com</name></author>
		
	</entry>
</feed>