archivist_lib/
diagnostics.rs

1// Copyright 2020 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use fuchsia_inspect::{
6    ExponentialHistogramParams, HistogramProperty, LinearHistogramParams, Node, NumericProperty,
7    UintExponentialHistogramProperty, UintLinearHistogramProperty, UintProperty,
8};
9use fuchsia_sync::Mutex;
10use std::collections::BTreeMap;
11use std::ffi::CStr;
12use std::sync::atomic::{AtomicUsize, Ordering};
13use std::sync::{Arc, OnceLock};
14use zx::{self as zx, MonotonicDuration};
15
16pub(crate) static TRACE_CATEGORY: &CStr = c"archivist";
17
18// Exponential histograms for time in microseconds contains power-of-two intervals
19static TIME_USEC_PARAMS: ExponentialHistogramParams<u64> =
20    ExponentialHistogramParams { floor: 0, initial_step: 1, step_multiplier: 2, buckets: 26 };
21
22// Linear histogram for max snapshot size in bytes requested by clients.
23// Divide configs into 10kb buckets, from 0mb to 1mb.
24static MAX_SNAPSHOT_SIZE_BYTES_PARAMS: LinearHistogramParams<u64> =
25    LinearHistogramParams { floor: 0, step_size: 10000, buckets: 100 };
26
27// Linear histogram tracking percent of schemas truncated for a given snapshot.
28// Divide configs into 5% buckets, from 0% to 100%.
29static SNAPSHOT_SCHEMA_TRUNCATION_PARAMS: LinearHistogramParams<u64> =
30    LinearHistogramParams { floor: 0, step_size: 5, buckets: 20 };
31
32pub struct AccessorStats {
33    /// Inspect node for tracking usage/health metrics of diagnostics platform.
34    _node: Node,
35
36    /// Metrics aggregated across all client connections.
37    pub global_stats: Arc<GlobalAccessorStats>,
38
39    /// Global stats tracking the usages of StreamDiagnostics for
40    /// exfiltrating inspect data.
41    pub inspect_stats: Arc<GlobalConnectionStats>,
42
43    /// Global stats tracking the usages of StreamDiagnostics for
44    /// exfiltrating logs.
45    pub logs_stats: Arc<GlobalConnectionStats>,
46}
47
48pub struct GlobalAccessorStats {
49    /// Property tracking number of opening connections to any archive_accessor instance.
50    pub connections_opened: UintProperty,
51    /// Property tracking number of closing connections to any archive_accessor instance.
52    pub connections_closed: UintProperty,
53    /// Number of requests to a single ArchiveAccessor to StreamDiagnostics, starting a
54    /// new inspect ReaderServer.
55    pub stream_diagnostics_requests: UintProperty,
56}
57
58impl AccessorStats {
59    pub fn new(node: Node) -> Self {
60        let connections_opened = node.create_uint("connections_opened", 0);
61        let connections_closed = node.create_uint("connections_closed", 0);
62
63        let stream_diagnostics_requests = node.create_uint("stream_diagnostics_requests", 0);
64
65        let inspect_stats = Arc::new(GlobalConnectionStats::new(node.create_child("inspect")));
66        let logs_stats = Arc::new(GlobalConnectionStats::new(node.create_child("logs")));
67
68        AccessorStats {
69            _node: node,
70            global_stats: Arc::new(GlobalAccessorStats {
71                connections_opened,
72                connections_closed,
73                stream_diagnostics_requests,
74            }),
75            inspect_stats,
76            logs_stats,
77        }
78    }
79
80    pub fn new_inspect_batch_iterator(&self) -> BatchIteratorConnectionStats {
81        self.inspect_stats.new_batch_iterator_connection()
82    }
83
84    pub fn new_logs_batch_iterator(&self) -> BatchIteratorConnectionStats {
85        self.logs_stats.new_batch_iterator_connection()
86    }
87}
88
89pub struct GlobalConnectionStats {
90    /// Weak clone of the node that stores stats, used for on-demand population.
91    node: Node,
92    /// Number of DiagnosticsServers created in response to an StreamDiagnostics
93    /// client request.
94    reader_servers_constructed: UintProperty,
95    /// Number of DiagnosticsServers destroyed in response to falling out of scope.
96    reader_servers_destroyed: UintProperty,
97    /// Stats about BatchIterator connections.
98    batch_iterator: GlobalBatchIteratorStats,
99    /// Property tracking number of times a future to retrieve diagnostics data for a component
100    /// timed out.
101    component_timeouts_count: UintProperty,
102    /// Number of times a diagnostics schema had to be truncated because it would otherwise
103    /// cause a component to exceed its configured size budget.
104    schema_truncation_count: UintProperty,
105    /// Optional histogram of processing times for individual components in GetNext
106    component_time_usec: OnceLock<UintExponentialHistogramProperty>,
107    /// Histogram of max aggregated snapshot sizes for overall Snapshot requests.
108    max_snapshot_sizes_bytes: UintLinearHistogramProperty,
109    /// Percentage of schemas in a single snapshot that got truncated.
110    snapshot_schema_truncation_percentage: UintLinearHistogramProperty,
111    /// Longest processing times for individual components, with timestamps.
112    processing_time_tracker: OnceLock<Mutex<ProcessingTimeTracker>>,
113    /// Node under which the batch iterator connections stats are created.
114    batch_iterator_connections: Node,
115    /// The id of the next BatchIterator connection.
116    next_connection_id: AtomicUsize,
117}
118
119impl GlobalConnectionStats {
120    pub fn new(node: Node) -> Self {
121        let reader_servers_constructed = node.create_uint("reader_servers_constructed", 0);
122        let reader_servers_destroyed = node.create_uint("reader_servers_destroyed", 0);
123
124        let batch_iterator = GlobalBatchIteratorStats::new(&node);
125        let component_timeouts_count = node.create_uint("component_timeouts_count", 0);
126
127        let max_snapshot_sizes_bytes = node.create_uint_linear_histogram(
128            "max_snapshot_sizes_bytes",
129            MAX_SNAPSHOT_SIZE_BYTES_PARAMS.clone(),
130        );
131
132        let snapshot_schema_truncation_percentage = node.create_uint_linear_histogram(
133            "snapshot_schema_truncation_percentage",
134            SNAPSHOT_SCHEMA_TRUNCATION_PARAMS.clone(),
135        );
136
137        let schema_truncation_count = node.create_uint("schema_truncation_count", 0);
138        let batch_iterator_connections = node.create_child("batch_iterator_connections");
139
140        GlobalConnectionStats {
141            node,
142            reader_servers_constructed,
143            reader_servers_destroyed,
144            batch_iterator,
145            batch_iterator_connections,
146            component_timeouts_count,
147            max_snapshot_sizes_bytes,
148            snapshot_schema_truncation_percentage,
149            schema_truncation_count,
150            component_time_usec: OnceLock::new(),
151            processing_time_tracker: OnceLock::new(),
152            next_connection_id: AtomicUsize::new(0),
153        }
154    }
155
156    fn new_batch_iterator_connection(self: &Arc<Self>) -> BatchIteratorConnectionStats {
157        let node = self
158            .batch_iterator_connections
159            .create_child(self.next_connection_id.fetch_add(1, Ordering::Relaxed).to_string());
160        BatchIteratorConnectionStats::new(node, Arc::clone(self))
161    }
162
163    pub fn add_timeout(&self) {
164        self.component_timeouts_count.add(1);
165    }
166
167    pub fn record_percent_truncated_schemas(&self, percent_truncated_schemas: u64) {
168        self.snapshot_schema_truncation_percentage.insert(percent_truncated_schemas);
169    }
170
171    pub fn record_max_snapshot_size_config(&self, max_snapshot_size_config: u64) {
172        self.max_snapshot_sizes_bytes.insert(max_snapshot_size_config);
173    }
174
175    /// Record the duration of a whole request to GetNext.
176    pub fn record_batch_duration(&self, duration: MonotonicDuration) {
177        let micros = duration.into_micros();
178        if micros >= 0 {
179            self.batch_iterator.get_next.time_usec.insert(micros as u64);
180        }
181    }
182
183    /// Record the duration of obtaining data from a single component.
184    pub fn record_component_duration(&self, moniker: impl AsRef<str>, duration: MonotonicDuration) {
185        let nanos = duration.into_nanos();
186        if nanos >= 0 {
187            // Lazily initialize stats that may not be needed for all diagnostics types.
188
189            let component_time_usec = self.component_time_usec.get_or_init(|| {
190                self.node.create_uint_exponential_histogram(
191                    "component_time_usec",
192                    TIME_USEC_PARAMS.clone(),
193                )
194            });
195
196            let processing_time_tracker = self.processing_time_tracker.get_or_init(|| {
197                Mutex::new(ProcessingTimeTracker::new(
198                    self.node.create_child("longest_processing_times"),
199                ))
200            });
201
202            component_time_usec.insert(nanos as u64 / 1000);
203            processing_time_tracker.lock().track(moniker.as_ref(), nanos as u64);
204        }
205    }
206}
207
208struct GlobalBatchIteratorStats {
209    _node: Node,
210    /// Property tracking number of opening connections to any batch iterator instance.
211    connections_opened: UintProperty,
212    /// Property tracking number of closing connections to any batch iterator instance.
213    connections_closed: UintProperty,
214    get_next: GlobalBatchIteratorGetNextStats,
215}
216
217impl GlobalBatchIteratorStats {
218    fn new(parent: &Node) -> Self {
219        let node = parent.create_child("batch_iterator");
220        let connections_opened = node.create_uint("connections_opened", 0);
221        let connections_closed = node.create_uint("connections_closed", 0);
222        let get_next = GlobalBatchIteratorGetNextStats::new(&node);
223        Self { _node: node, connections_opened, connections_closed, get_next }
224    }
225}
226
227struct GlobalBatchIteratorGetNextStats {
228    _node: Node,
229    /// Number of times "GetNext" was called
230    requests: UintProperty,
231    /// Number of times a "GetNext" response was sent
232    responses: UintProperty,
233    /// Number of items returned in batches from "GetNext"
234    result_count: UintProperty,
235    /// Number of items returned in batches from "GetNext" that contained errors
236    result_errors: UintProperty,
237    /// Histogram of processing times for overall "GetNext" requests.
238    time_usec: UintExponentialHistogramProperty,
239}
240
241impl GlobalBatchIteratorGetNextStats {
242    fn new(parent: &Node) -> Self {
243        let node = parent.create_child("get_next");
244        let requests = node.create_uint("requests", 0);
245        let responses = node.create_uint("responses", 0);
246        let result_count = node.create_uint("result_count", 0);
247        let result_errors = node.create_uint("result_errors", 0);
248        let time_usec =
249            node.create_uint_exponential_histogram("time_usec", TIME_USEC_PARAMS.clone());
250        Self { _node: node, requests, responses, result_count, result_errors, time_usec }
251    }
252}
253
254const PROCESSING_TIME_COMPONENT_COUNT_LIMIT: usize = 20;
255
256/// Holds stats on the longest processing times for individual components' data.
257struct ProcessingTimeTracker {
258    /// The node holding all properties for the tracker.
259    node: Node,
260    /// Map from component moniker to a tuple of its time and a node containing the stats about it.
261    longest_times_by_component: BTreeMap<String, (u64, Node)>,
262    /// The shortest time seen so far. If a new component is being
263    /// recorded and its time is greater than this, we need to pop the
264    /// entry containing this time.
265    shortest_time_ns: u64,
266}
267
268impl ProcessingTimeTracker {
269    fn new(node: Node) -> Self {
270        Self { node, longest_times_by_component: BTreeMap::new(), shortest_time_ns: u64::MAX }
271    }
272    fn track(&mut self, moniker: &str, time_ns: u64) {
273        let at_capacity =
274            self.longest_times_by_component.len() >= PROCESSING_TIME_COMPONENT_COUNT_LIMIT;
275
276        // Do nothing if the container it as the limit and the new time doesn't need to get
277        // inserted.
278        if at_capacity && time_ns < self.shortest_time_ns {
279            return;
280        }
281
282        let parent_node = &self.node;
283
284        let make_entry = || {
285            let n = parent_node.create_child(moniker.to_string());
286            n.record_int("@time", zx::BootInstant::get().into_nanos());
287            n.record_double("duration_seconds", time_ns as f64 / 1e9);
288            (time_ns, n)
289        };
290
291        self.longest_times_by_component
292            .entry(moniker.to_string())
293            .and_modify(move |v| {
294                if v.0 < time_ns {
295                    *v = make_entry();
296                }
297            })
298            .or_insert_with(make_entry);
299
300        // Repeatedly find the key for the smallest time and remove it until we are under the
301        // limit.
302        while self.longest_times_by_component.len() > PROCESSING_TIME_COMPONENT_COUNT_LIMIT {
303            let mut key = "".to_string();
304            for (k, (val, _)) in &self.longest_times_by_component {
305                if *val == self.shortest_time_ns {
306                    key.clone_from(k);
307                    break;
308                }
309            }
310            self.longest_times_by_component.remove(&key);
311            self.shortest_time_ns =
312                self.longest_times_by_component.values().map(|v| v.0).min().unwrap_or(u64::MAX);
313        }
314
315        self.shortest_time_ns = std::cmp::min(self.shortest_time_ns, time_ns);
316    }
317}
318
319pub struct BatchIteratorConnectionStats {
320    /// Inspect node for tracking usage/health metrics of a single connection to a batch iterator.
321    _node: Node,
322    /// Global stats for connections to the BatchIterator protocol.
323    global_stats: Arc<GlobalConnectionStats>,
324    /// Property tracking number of requests to the BatchIterator instance this struct is tracking.
325    get_next_requests: UintProperty,
326    /// Property tracking number of responses from the BatchIterator instance this struct is tracking.
327    get_next_responses: UintProperty,
328    /// Property tracking number of times the batch iterator has served a terminal batch signalling that
329    /// the client has reached the end of the iterator and should terminate their connection.
330    get_next_terminal_responses: UintProperty,
331}
332
333impl BatchIteratorConnectionStats {
334    fn new(node: Node, global_stats: Arc<GlobalConnectionStats>) -> Self {
335        // we'll decrement these on drop
336        global_stats.reader_servers_constructed.add(1);
337
338        let get_next = node.create_child("get_next");
339        let get_next_requests = get_next.create_uint("requests", 0);
340        let get_next_responses = get_next.create_uint("responses", 0);
341        let get_next_terminal_responses = get_next.create_uint("terminal_responses", 0);
342        node.record(get_next);
343
344        Self {
345            _node: node,
346            global_stats,
347            get_next_requests,
348            get_next_responses,
349            get_next_terminal_responses,
350        }
351    }
352
353    pub fn open_connection(&self) {
354        self.global_stats.batch_iterator.connections_opened.add(1);
355    }
356
357    pub fn close_connection(&self) {
358        self.global_stats.batch_iterator.connections_closed.add(1);
359    }
360
361    pub fn global_stats(&self) -> &Arc<GlobalConnectionStats> {
362        &self.global_stats
363    }
364
365    pub fn add_request(&self) {
366        self.global_stats.batch_iterator.get_next.requests.add(1);
367        self.get_next_requests.add(1);
368    }
369
370    pub fn add_response(&self) {
371        self.global_stats.batch_iterator.get_next.responses.add(1);
372        self.get_next_responses.add(1);
373    }
374
375    pub fn add_terminal(&self) {
376        self.get_next_terminal_responses.add(1);
377    }
378
379    pub fn add_result(&self) {
380        self.global_stats.batch_iterator.get_next.result_count.add(1);
381    }
382
383    pub fn add_result_error(&self) {
384        self.global_stats.batch_iterator.get_next.result_errors.add(1);
385    }
386
387    pub fn add_schema_truncated(&self) {
388        self.global_stats.schema_truncation_count.add(1);
389    }
390}
391
392impl Drop for BatchIteratorConnectionStats {
393    fn drop(&mut self) {
394        self.global_stats.reader_servers_destroyed.add(1);
395    }
396}
397
398#[cfg(test)]
399mod test {
400    use super::*;
401    use diagnostics_assertions::{assert_data_tree, AnyProperty};
402    use fuchsia_inspect::health::Reporter;
403    use fuchsia_inspect::{component, Inspector};
404
405    #[fuchsia::test]
406    fn health() {
407        component::health().set_ok();
408        assert_data_tree!(component::inspector(),
409        root: {
410            "fuchsia.inspect.Health": {
411                status: "OK",
412                start_timestamp_nanos: AnyProperty,
413            }
414        });
415
416        component::health().set_unhealthy("Bad state");
417        assert_data_tree!(component::inspector(),
418        root: contains {
419            "fuchsia.inspect.Health": {
420                status: "UNHEALTHY",
421                message: "Bad state",
422                start_timestamp_nanos: AnyProperty,
423            }
424        });
425
426        component::health().set_ok();
427        assert_data_tree!(component::inspector(),
428        root: contains {
429            "fuchsia.inspect.Health": {
430                status: "OK",
431                start_timestamp_nanos: AnyProperty,
432            }
433        });
434    }
435
436    #[fuchsia::test]
437    fn processing_time_tracker() {
438        let inspector = Inspector::default();
439        let mut tracker = ProcessingTimeTracker::new(inspector.root().create_child("test"));
440
441        tracker.track("a", 1e9 as u64);
442        assert_data_tree!(inspector,
443        root: {
444            test: {
445                a: {
446                    "@time": AnyProperty,
447                    duration_seconds: 1f64
448                }
449            }
450        });
451
452        tracker.track("a", 5e8 as u64);
453        assert_data_tree!(inspector,
454        root: {
455            test: {
456                a: {
457                    "@time": AnyProperty,
458                    duration_seconds: 1f64
459                }
460            }
461        });
462
463        tracker.track("a", 5500e6 as u64);
464        assert_data_tree!(inspector,
465        root: {
466            test: {
467                a: {
468                    "@time": AnyProperty,
469                    duration_seconds: 5.5f64
470                }
471            }
472        });
473
474        for time in 0..60 {
475            tracker.track(&format!("b{time}"), time * 1e9 as u64);
476        }
477
478        assert_data_tree!(inspector,
479        root: {
480            test: {
481                b40: { "@time": AnyProperty, duration_seconds: 40f64 },
482                b41: { "@time": AnyProperty, duration_seconds: 41f64 },
483                b42: { "@time": AnyProperty, duration_seconds: 42f64 },
484                b43: { "@time": AnyProperty, duration_seconds: 43f64 },
485                b44: { "@time": AnyProperty, duration_seconds: 44f64 },
486                b45: { "@time": AnyProperty, duration_seconds: 45f64 },
487                b46: { "@time": AnyProperty, duration_seconds: 46f64 },
488                b47: { "@time": AnyProperty, duration_seconds: 47f64 },
489                b48: { "@time": AnyProperty, duration_seconds: 48f64 },
490                b49: { "@time": AnyProperty, duration_seconds: 49f64 },
491                b50: { "@time": AnyProperty, duration_seconds: 50f64 },
492                b51: { "@time": AnyProperty, duration_seconds: 51f64 },
493                b52: { "@time": AnyProperty, duration_seconds: 52f64 },
494                b53: { "@time": AnyProperty, duration_seconds: 53f64 },
495                b54: { "@time": AnyProperty, duration_seconds: 54f64 },
496                b55: { "@time": AnyProperty, duration_seconds: 55f64 },
497                b56: { "@time": AnyProperty, duration_seconds: 56f64 },
498                b57: { "@time": AnyProperty, duration_seconds: 57f64 },
499                b58: { "@time": AnyProperty, duration_seconds: 58f64 },
500                b59: { "@time": AnyProperty, duration_seconds: 59f64 },
501            }
502        });
503    }
504}