diem_metrics/
lib.rs

1// Copyright (c) The Diem Core Contributors
2// SPDX-License-Identifier: Apache-2.0
3
4// Copyright 2021 Conflux Foundation. All rights reserved.
5// Conflux is free software and distributed under GNU General Public License.
6// See http://www.gnu.org/licenses/
7
8//! # Metrics
9//!
10//! ## Counters
11//!
//! Used to measure values that only accumulate over time; rates can then
//! be used in graphs to check how quickly they change.
//! An example would be incrementing a counter every time an incoming message occurs.
15//! ```
16//! use prometheus::register_int_counter_vec;
17//!
18//! register_int_counter_vec!(
19//!     "name",
20//!     "description",
21//!     &["dimension_1", "dimension_2"]
22//! );
23//! ```
24//!
25//! ## Gauges
26//! Used to measure values that change level over time.  An example
27//! would be to set the number of connected peers.
28//! ```
29//! use prometheus::register_int_gauge_vec;
30//!
31//! register_int_gauge_vec!(
32//!     "name",
33//!     "description",
34//!     &["dimension_1", "dimension_2"]
35//! );
36//! ```
37//!
38//! ## Histograms
39//! Used to measure histogram values.  An example is network
40//! connection latency.
41//! ```
42//! use prometheus::register_histogram_vec;
43//!
44//! register_histogram_vec!(
45//!     "name",
46//!     "description",
47//!     &["dimension_1", "dimension_2"]
48//! );
49//! ```
50
51#![forbid(unsafe_code)]
52#![recursion_limit = "128"]
53
54mod json_encoder;
55mod json_metrics;
56pub mod metric_server;
57mod public_metrics;
58
59mod op_counters;
60pub use op_counters::{DurationHistogram, OpMetrics};
61
62#[cfg(test)]
63mod unit_tests;
64
// Re-export metric types (counters, gauges, histograms) and their
// registration macros from diem_metrics_core
66pub use diem_metrics_core::{
67    register_histogram, register_histogram_vec, register_int_counter,
68    register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
69    Histogram, HistogramTimer, HistogramVec, IntCounter, IntCounterVec,
70    IntGauge, IntGaugeVec,
71};
72
73use anyhow::Result;
74use diem_logger::prelude::*;
75use once_cell::sync::Lazy;
76use prometheus::{proto::MetricType, Encoder, TextEncoder};
77use std::{
78    collections::HashMap,
79    fs::{create_dir_all, File, OpenOptions},
80    io::Write,
81    path::Path,
82    thread, time,
83};
84
85pub static NUM_METRICS: Lazy<IntCounterVec> = Lazy::new(|| {
86    register_int_counter_vec!(
87        "diem_metrics",
88        "Number of metrics in certain states",
89        &["type"]
90    )
91    .unwrap()
92});
93
94fn get_metrics_file<P: AsRef<Path>>(dir_path: &P, file_name: &str) -> File {
95    create_dir_all(dir_path).expect("Create metrics dir failed");
96
97    let metrics_file_path = dir_path.as_ref().join(file_name);
98
99    diem_info!("Using metrics file {}", metrics_file_path.display());
100
101    OpenOptions::new()
102        .append(true)
103        .create(true)
104        .open(metrics_file_path)
105        .expect("Open metrics file failed")
106}
107
108pub fn gather_metrics() -> Vec<prometheus::proto::MetricFamily> {
109    let metric_families = diem_metrics_core::gather();
110    let mut total: u64 = 0;
111    let mut families_over_1000: u64 = 0;
112
113    // Take metrics of metric gathering so we know possible overhead of this
114    // process
115    for metric_family in &metric_families {
116        let family_count = metric_family.get_metric().len();
117        if family_count > 1000 {
118            families_over_1000 = families_over_1000.saturating_add(1);
119            let name = metric_family.get_name();
120            diem_warn!(
121                count = family_count,
122                metric_family = name,
123                "Metric Family '{}' over 1000 dimensions '{}'",
124                name,
125                family_count
126            );
127        }
128        total = total.saturating_add(family_count as u64);
129    }
130
131    // These metrics will be reported on the next pull, rather than create a new
132    // family
133    NUM_METRICS.with_label_values(&["total"]).inc_by(total);
134    NUM_METRICS
135        .with_label_values(&["families_over_1000"])
136        .inc_by(families_over_1000);
137
138    metric_families
139}
140
141fn get_all_metrics_as_serialized_string() -> Result<Vec<u8>> {
142    let all_metrics = gather_metrics();
143
144    let encoder = TextEncoder::new();
145    let mut buffer = Vec::new();
146    encoder.encode(&all_metrics, &mut buffer)?;
147    Ok(buffer)
148}
149
150pub fn get_all_metrics() -> HashMap<String, String> {
151    // TODO: use an existing metric encoder (same as used by
152    // prometheus/metric-server)
153    let all_metric_families = gather_metrics();
154    let mut all_metrics = HashMap::new();
155    for metric_family in all_metric_families {
156        let values: Vec<_> = match metric_family.get_field_type() {
157            MetricType::COUNTER => metric_family
158                .get_metric()
159                .iter()
160                .map(|m| m.get_counter().get_value().to_string())
161                .collect(),
162            MetricType::GAUGE => metric_family
163                .get_metric()
164                .iter()
165                .map(|m| m.get_gauge().get_value().to_string())
166                .collect(),
167            MetricType::SUMMARY => panic!("Unsupported Metric 'SUMMARY'"),
168            MetricType::UNTYPED => panic!("Unsupported Metric 'UNTYPED'"),
169            MetricType::HISTOGRAM => metric_family
170                .get_metric()
171                .iter()
172                .map(|m| m.get_histogram().get_sample_count().to_string())
173                .collect(),
174        };
175        let metric_names = metric_family.get_metric().iter().map(|m| {
176            let label_strings: Vec<String> = m
177                .get_label()
178                .iter()
179                .map(|l| format!("{}={}", l.get_name(), l.get_value()))
180                .collect();
181            let labels_string = format!("{{{}}}", label_strings.join(","));
182            format!("{}{}", metric_family.get_name(), labels_string)
183        });
184
185        for (name, value) in metric_names.zip(values.into_iter()) {
186            all_metrics.insert(name, value);
187        }
188    }
189
190    all_metrics
191}
192
193// Launches a background thread which will periodically collect metrics
194// every interval and write them to the provided file
195pub fn dump_all_metrics_to_file_periodically<P: AsRef<Path>>(
196    dir_path: &P, file_name: &str, interval: u64,
197) {
198    let mut file = get_metrics_file(dir_path, file_name);
199    thread::spawn(move || loop {
200        let mut buffer = get_all_metrics_as_serialized_string()
201            .expect("Error gathering metrics");
202        if !buffer.is_empty() {
203            buffer.push(b'\n');
204            file.write_all(&buffer).expect("Error writing metrics");
205        }
206        thread::sleep(time::Duration::from_millis(interval));
207    });
208}
209
/// Helper macro to record metrics for external calls.
///
/// Records a timer named `$name` and a gauge named `<$name>_running` that
/// is incremented while `$fn` is being evaluated and decremented
/// afterwards (so it reads 1 while inside the call and 0 otherwise, for a
/// single caller).
///
/// The gauge is decremented by a drop guard, so it is restored even when
/// `$fn` leaves the block early: through `?` / `return` inside the
/// expression, through a panic that unwinds, or because the enclosing
/// future is dropped.
///
/// The macro resolves `super::counters::OP_COUNTERS` relative to the call
/// site, so the parent module of the caller must expose a `counters`
/// module containing an `OP_COUNTERS` value with `timer` and `gauge`
/// methods (presumably an `OpMetrics` — confirm against the callers).
#[macro_export]
macro_rules! monitor {
    ($name:literal, $fn:expr) => {{
        use super::counters::OP_COUNTERS;

        // Runs the wrapped closure when dropped. Items are not hygienic,
        // hence the deliberately unusual name to avoid shadowing a type
        // that `$fn` might refer to.
        struct __MonitorRunningGuard<F: Fn()>(F);
        impl<F: Fn()> Drop for __MonitorRunningGuard<F> {
            fn drop(&mut self) {
                (self.0)()
            }
        }

        let _timer = OP_COUNTERS.timer($name);
        let gauge = OP_COUNTERS.gauge(concat!($name, "_running"));
        gauge.inc();
        // Declared after `_timer`, so it is dropped first: the gauge is
        // decremented before the timer stops, as in the straight-line path.
        let _running = __MonitorRunningGuard(|| {
            gauge.dec();
        });
        let result = $fn;
        result
    }};
}