reachability_core/
watchdog.rs

1// Copyright 2022 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! Provides an interface health watchdog.
6//!
7//! The watchdog uses gateway neighbor reachability information and interface
8//! counters to evaluate interface health and triggers debug information dumps
9//! on the system logs when it finds unhealthy interfaces.
10
11use crate::neighbor_cache::NeighborHealth;
12use crate::{Id as InterfaceId, InterfaceView};
13use fidl_fuchsia_net_interfaces_ext as fnet_interfaces_ext;
14use itertools::Itertools as _;
15use log::{debug, error, info, warn};
16use std::collections::HashMap;
17
/// The minimum amount of time for a device counter to be stuck in the same
/// value for the device to be considered unhealthy.
///
/// Compared against the time elapsed since the rx or tx frame counter last
/// changed, and only once every eligible gateway of the interface is
/// unhealthy.
const DEVICE_COUNTERS_UNHEALTHY_TIME: zx::MonotonicDuration =
    zx::MonotonicDuration::from_minutes(2);

/// The minimum amount of time to wait before generating a new request for debug
/// information.
///
/// Measured from the time of the last triggered debug info action recorded in
/// a [`HealthStatus`].
const DEBUG_INFO_COOLDOWN: zx::MonotonicDuration = zx::MonotonicDuration::from_minutes(15);

/// The minimum amount of time for a neighbor in unhealthy state to trigger
/// actions.
///
/// A gateway last seen healthy more recently than this is classified as
/// [`GatewayHealth::RecentlyUnhealthy`] rather than [`GatewayHealth::Unhealthy`].
const NEIGHBOR_UNHEALTHY_TIME: zx::MonotonicDuration = zx::MonotonicDuration::from_minutes(1);
30
/// Errors returned by [`DeviceDiagnosticsProvider`] and [`SystemDispatcher`]
/// operations.
#[derive(Debug, thiserror::Error)]
#[cfg_attr(test, derive(Clone))]
pub enum Error {
    /// The operation timed out. When reading counters during evaluation, this
    /// results in a device diagnosis request.
    #[error("Operation timed out")]
    Timeout,
    /// A FIDL error; constructed from [`fidl::Error`] through `From`.
    #[error("FIDL error {0}")]
    Fidl(#[from] fidl::Error),
    /// The operation is not supported.
    #[error("Unsupported operation")]
    NotSupported,
}
41
/// Device counters as returned by [`DeviceDiagnosticsProvider::get_counters`].
///
/// The watchdog only checks whether each value changes between polls.
#[derive(Debug)]
#[cfg_attr(test, derive(Copy, Clone))]
pub struct DeviceCounters {
    /// Receive-side frame counter.
    pub rx_frames: u64,
    /// Transmit-side frame counter.
    pub tx_frames: u64,
}
48
/// A counter value paired with the time at which that value was recorded.
#[derive(Debug)]
#[cfg_attr(test, derive(Eq, PartialEq))]
struct TimestampedCounter {
    /// The most recently recorded counter value.
    value: u64,
    /// When `value` was recorded; only moves forward when an update observes a
    /// different value.
    at: zx::MonotonicInstant,
}
55
56impl TimestampedCounter {
57    fn update(&mut self, new_value: u64, new_at: zx::MonotonicInstant) -> bool {
58        let Self { value, at } = self;
59        if new_value != *value {
60            *at = new_at;
61            *value = new_value;
62            true
63        } else {
64            false
65        }
66    }
67}
68
/// Source of diagnostics for a single device, used by the watchdog to read
/// counters and to request debug dumps.
#[async_trait::async_trait]
pub trait DeviceDiagnosticsProvider {
    /// Reads the device's current counters.
    async fn get_counters(&self) -> Result<DeviceCounters, Error>;

    /// Requests that debug information for the device be logged.
    async fn log_debug_info(&self) -> Result<(), Error>;
}
75
/// Health state that also remembers when a debug info action was last
/// triggered, so that triggers can respect [`DEBUG_INFO_COOLDOWN`].
#[derive(Debug)]
#[cfg_attr(test, derive(Eq, PartialEq))]
enum HealthStatus {
    /// Unhealthy; `last_action` is the time of the last triggered action.
    Unhealthy { last_action: zx::MonotonicInstant },
    /// Healthy; `last_action` is `None` if no action has been triggered yet.
    Healthy { last_action: Option<zx::MonotonicInstant> },
}
82
83impl HealthStatus {
84    /// Sets the status to unhealthy at time `now`.
85    ///
86    /// Return `true` if a debug info action should be triggered respecting
87    /// cooldown.
88    fn set_unhealthy_and_check_for_debug_info_cooldown(
89        &mut self,
90        now: zx::MonotonicInstant,
91    ) -> bool {
92        let last_action = match self {
93            HealthStatus::Unhealthy { last_action } => Some(*last_action),
94            HealthStatus::Healthy { last_action } => *last_action,
95        };
96
97        let (trigger_debug_info, last_action) = match last_action
98            .map(|last_action| (now - last_action >= DEBUG_INFO_COOLDOWN, last_action))
99        {
100            // Either we haven't yet triggered a debug info action or we've
101            // passed the cooldown period.
102            None | Some((true, _)) => (true, now),
103            // We're still in the cooldown period from the last triggered
104            // action.
105            Some((false, last_action)) => (false, last_action),
106        };
107
108        *self = HealthStatus::Unhealthy { last_action: last_action };
109
110        return trigger_debug_info;
111    }
112
113    /// Sets the system health status to healthy.
114    fn set_healthy(&mut self) {
115        match self {
116            HealthStatus::Unhealthy { last_action } => {
117                *self = HealthStatus::Healthy { last_action: Some(*last_action) };
118            }
119            HealthStatus::Healthy { last_action: _ } => {}
120        }
121    }
122}
123
/// Diagnostics bookkeeping for one interface whose device counters can be
/// read.
#[derive(Debug)]
struct InterfaceDiagnosticsState<D> {
    /// Handle used to read counters and to request device debug info.
    diagnostics: D,
    /// Last observed rx frame counter and when it last changed.
    rx: TimestampedCounter,
    /// Last observed tx frame counter and when it last changed.
    tx: TimestampedCounter,
    /// Time of the most recent successful counter read.
    updated_at: zx::MonotonicInstant,
    /// Interface health, including the debug info cooldown state.
    health: HealthStatus,
}
132
/// State kept by the watchdog for each interface it has seen.
#[derive(Debug)]
struct InterfaceState<D> {
    /// `None` when diagnostics could not be initialized for the interface; the
    /// watchdog then takes no action on it.
    diagnostics_state: Option<InterfaceDiagnosticsState<D>>,
}
137
/// Interface health watchdog.
///
/// Holds per-interface diagnostics state plus a system-wide cooldown for
/// system debug info requests.
pub struct Watchdog<S: SystemDispatcher> {
    /// Per-interface state, created the first time an interface is checked and
    /// dropped when the interface is removed.
    interfaces: HashMap<InterfaceId, InterfaceState<S::DeviceDiagnostics>>,
    /// Cooldown state shared by all interfaces for system debug info requests.
    system_health_status: HealthStatus,
    /// Ties the watchdog to its dispatcher type without storing one.
    _marker: std::marker::PhantomData<S>,
}
143
/// Gives the watchdog access to system-level debug dumps and to per-device
/// diagnostics handles.
#[async_trait::async_trait]
pub trait SystemDispatcher {
    /// The per-device diagnostics handle type.
    type DeviceDiagnostics: DeviceDiagnosticsProvider;

    /// Requests that system-wide debug information be logged.
    async fn log_debug_info(&self) -> Result<(), Error>;

    /// Returns a diagnostics handle for `interface`.
    ///
    /// The watchdog treats an error here as the interface being unsupported.
    fn get_device_diagnostics(
        &self,
        interface: InterfaceId,
    ) -> Result<Self::DeviceDiagnostics, Error>;
}
155
/// Why a debugging action was requested.
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
enum ActionReason {
    /// Reading the device counters timed out.
    CantFetchCounters,
    /// The rx frame counter has not changed for at least
    /// `DEVICE_COUNTERS_UNHEALTHY_TIME`.
    DeviceRxStall,
    /// The tx frame counter has not changed for at least
    /// `DEVICE_COUNTERS_UNHEALTHY_TIME`.
    DeviceTxStall,
}
162
/// A debugging action produced by evaluating an interface.
#[derive(Debug, Eq, PartialEq)]
struct Action {
    /// Whether to request system debug info; the request itself is still
    /// subject to the system-wide cooldown.
    trigger_stack_diagnosis: bool,
    /// Whether to request debug info from the interface's device.
    trigger_device_diagnosis: bool,
    /// What prompted the action.
    reason: ActionReason,
}
169
170impl<S> Watchdog<S>
171where
172    S: SystemDispatcher,
173{
174    pub fn new() -> Self {
175        Self {
176            interfaces: HashMap::new(),
177            system_health_status: HealthStatus::Healthy { last_action: None },
178            _marker: std::marker::PhantomData,
179        }
180    }
181
182    async fn initialize_interface_state(
183        now: zx::MonotonicInstant,
184        sys: &S,
185        interface: InterfaceId,
186    ) -> Option<InterfaceDiagnosticsState<S::DeviceDiagnostics>> {
187        // Get a diagnostics handle and read the initial counters.
188        let diagnostics = match sys.get_device_diagnostics(interface) {
189            Ok(d) => d,
190            Err(e) => {
191                warn!(
192                    err:? = e,
193                    iface = interface;
194                    "failed to read diagnostics state, assuming unsupported interface"
195                );
196                return None;
197            }
198        };
199        let DeviceCounters { rx_frames, tx_frames } = match diagnostics.get_counters().await {
200            Ok(c) => c,
201            Err(e) => {
202                warn!(
203                    err:? = e,
204                    iface = interface;
205                    "failed to read device counters, assuming unsupported interface"
206                );
207                return None;
208            }
209        };
210        Some(InterfaceDiagnosticsState {
211            diagnostics,
212            rx: TimestampedCounter { value: rx_frames, at: now },
213            tx: TimestampedCounter { value: tx_frames, at: now },
214            updated_at: now,
215            health: HealthStatus::Healthy { last_action: None },
216        })
217    }
218
219    pub async fn check_interface_state(
220        &mut self,
221        now: zx::MonotonicInstant,
222        sys: &S,
223        view: InterfaceView<'_>,
224    ) {
225        debug!(view:? = view; "poll interface state");
226        let Self { interfaces, system_health_status, _marker: _ } = self;
227
228        let interface = view.properties.id;
229
230        let InterfaceState { diagnostics_state } = match interfaces.entry(interface.get()) {
231            std::collections::hash_map::Entry::Occupied(entry) => entry.into_mut(),
232            std::collections::hash_map::Entry::Vacant(vacant) => vacant.insert(InterfaceState {
233                diagnostics_state: Self::initialize_interface_state(now, sys, interface.get())
234                    .await,
235            }),
236        };
237
238        let diagnostics_state = if let Some(d) = diagnostics_state.as_mut() {
239            d
240        } else {
241            // Do nothing for unsupported interfaces, we can't get counters or
242            // trigger debug info on them.
243            return;
244        };
245
246        if let Some(action) = Self::evaluate_interface_state(now, diagnostics_state, view).await {
247            info!(
248                action:? = action,
249                iface = interface;
250                "bad state detected, action requested"
251            );
252            let Action { trigger_stack_diagnosis, trigger_device_diagnosis, reason: _ } = action;
253            if trigger_device_diagnosis {
254                diagnostics_state.diagnostics.log_debug_info().await.unwrap_or_else(
255                    |e| error!(err:? = e, iface = interface; "failed to request device debug info"),
256                );
257            }
258            if trigger_stack_diagnosis {
259                if system_health_status.set_unhealthy_and_check_for_debug_info_cooldown(now) {
260                    sys.log_debug_info().await.unwrap_or_else(
261                        |e| error!(err:? = e; "failed to request system debug info"),
262                    );
263                }
264            }
265        }
266    }
267
268    /// Evaluates the given interface state, returning an optional debugging
269    /// action to be triggered.
270    ///
271    /// Interfaces are evaluated at two levels. First, all the gateways are
272    /// evaluated against the neighbor table. Second, if all gateways are
273    /// unhealthy, the device counters are polled until a stall is observed. If
274    /// an Rx or Tx stall is seen, a debug action will be requested.
275    ///
276    /// If there's a timeout attempting to fetch interface counters, a debug
277    /// request may also be issued.
278    async fn evaluate_interface_state(
279        now: zx::MonotonicInstant,
280        diag_state: &mut InterfaceDiagnosticsState<S::DeviceDiagnostics>,
281        InterfaceView {
282            properties: fnet_interfaces_ext::Properties { id: interface, .. },
283            routes,
284            neighbors,
285        }: InterfaceView<'_>,
286    ) -> Option<Action> {
287        let InterfaceDiagnosticsState { diagnostics, rx, tx, updated_at, health } = diag_state;
288        let interface = *interface;
289
290        debug!(iface = interface; "evaluate interface state");
291
292        let mut neighbors = neighbors.as_ref()?.iter_health();
293        let found_healthy_gateway = neighbors
294            .fold_while(None, |found_healthy_gateway, (neighbor, health)| {
295                let is_router = routes.device_routes(interface.get()).any(|route| {
296                    route.next_hop.map(|next_hop| *neighbor == next_hop).unwrap_or(false)
297                });
298
299                if !is_router {
300                    return itertools::FoldWhile::Continue(found_healthy_gateway);
301                }
302
303                let gateway_health = GatewayHealth::from_neighbor_health(health, now);
304                debug!(
305                    iface = interface,
306                    neighbor:? = fidl_fuchsia_net_ext::IpAddress::from(neighbor.clone()),
307                    health:? = gateway_health;
308                    "router check"
309                );
310                match gateway_health {
311                    // When we find a healthy neighbor, immediately break the
312                    // fold.
313                    GatewayHealth::Healthy
314                    // A gateway that hasn't been unhealthy for a long time may
315                    // only be going through a temporary outage.
316                    | GatewayHealth::RecentlyUnhealthy
317                    // Unknown gateway state is assumed to be healthy. Expected
318                    // to shift once neighbor table fills up.
319                    | GatewayHealth::Unknown
320                    => {
321                        itertools::FoldWhile::Done(Some(true))
322                    }
323                    // A gateway that was never healthy is considered a
324                    // misconfiguration and should not trip the watchdog.
325                    // Skip it entirely so it's not considered for the search.
326                    | GatewayHealth::NeverHealthy => {
327                        itertools::FoldWhile::Continue(found_healthy_gateway)
328                    }
329                    GatewayHealth::Unhealthy => itertools::FoldWhile::Continue(Some(false)),
330                }
331            })
332            .into_inner();
333
334        match found_healthy_gateway {
335            // If there are no gateways, there's not much we can do. Assume that
336            // either the interface is not configured for upstream connectivity
337            // or we're going through a link flap event.
338            None => {
339                debug!(iface = interface; "no gateway in neighbors");
340                return None;
341            }
342            // If there's at least one healthy gateway, there's no action to be
343            // taken, but we can mark the interface as healthy.
344            Some(true) => {
345                debug!(iface = interface; "neighbors are healthy");
346                health.set_healthy();
347                return None;
348            }
349            // If we found at least one gateway and they're all unhealthy,
350            // proceed to check device counters.
351            Some(false) => (),
352        }
353
354        let counters = match diagnostics.get_counters().await {
355            Ok(counters) => counters,
356            Err(Error::Timeout) => {
357                return Some(Action {
358                    trigger_stack_diagnosis: false,
359                    trigger_device_diagnosis: true,
360                    reason: ActionReason::CantFetchCounters,
361                });
362            }
363            Err(Error::Fidl(e)) => {
364                if !e.is_closed() {
365                    error!(
366                        e:? = e,
367                        iface = interface;
368                        "failed to read counters for interface, no action will be taken"
369                    );
370                }
371                return None;
372            }
373            Err(Error::NotSupported) => {
374                error!(
375                    iface = interface;
376                    "failed to read counters for interface, no action will be taken"
377                );
378                return None;
379            }
380        };
381        let DeviceCounters { rx_frames, tx_frames } = counters;
382        if !rx.update(rx_frames, now) {
383            warn!(
384                rx:? = rx,
385                now = now.into_nanos(),
386                iface = interface;
387                "failed to observe rx traffic since last check"
388            );
389        }
390        if !tx.update(tx_frames, now) {
391            warn!(
392                tx:? = tx,
393                now = now.into_nanos(),
394                iface = interface;
395                "failed to observe tx traffic since last check"
396            );
397        }
398        *updated_at = now;
399        if let Some(reason) = [(rx, ActionReason::DeviceRxStall), (tx, ActionReason::DeviceTxStall)]
400            .iter()
401            .find_map(|(TimestampedCounter { value: _, at }, reason)| {
402                (now - *at >= DEVICE_COUNTERS_UNHEALTHY_TIME).then_some(*reason)
403            })
404        {
405            let action = health.set_unhealthy_and_check_for_debug_info_cooldown(now).then_some({
406                Action { trigger_stack_diagnosis: true, trigger_device_diagnosis: true, reason }
407            });
408
409            return action;
410        }
411
412        info!(
413            iface = interface,
414            rx = rx_frames,
415            tx = tx_frames;
416            "gateways are unhealthy, but counters are healthy."
417        );
418
419        // Counters are not stalled, mark the interface as healthy.
420        health.set_healthy();
421
422        None
423    }
424
425    pub fn handle_interface_removed(&mut self, interface: InterfaceId) {
426        let Self { interfaces, system_health_status: _, _marker: _ } = self;
427        match interfaces.remove(&interface) {
428            Some(InterfaceState { .. }) => (),
429            None => error!(iface = interface; "attempted to remove unknown interface"),
430        }
431    }
432}
433
/// How the watchdog classifies a gateway, derived from its neighbor health
/// and the current time.
#[derive(Debug, PartialEq, Eq)]
enum GatewayHealth {
    /// The neighbor's health is unknown.
    Unknown,
    /// The neighbor is healthy.
    Healthy,
    /// The neighbor is unhealthy but was healthy less than
    /// `NEIGHBOR_UNHEALTHY_TIME` ago.
    RecentlyUnhealthy,
    /// The neighbor is unhealthy and was last healthy at least
    /// `NEIGHBOR_UNHEALTHY_TIME` ago.
    Unhealthy,
    /// The neighbor is unhealthy and has never been observed healthy.
    NeverHealthy,
}
442
443impl GatewayHealth {
444    /// Checks if a gateway with reported `health` should be considered healthy.
445    fn from_neighbor_health(health: &NeighborHealth, now: zx::MonotonicInstant) -> Self {
446        match health {
447            NeighborHealth::Unknown => Self::Unknown,
448            NeighborHealth::Healthy { last_observed: _ } => Self::Healthy,
449            NeighborHealth::Unhealthy { last_healthy: None } => Self::NeverHealthy,
450            NeighborHealth::Unhealthy { last_healthy: Some(last_healthy) } => {
451                if now - *last_healthy < NEIGHBOR_UNHEALTHY_TIME {
452                    Self::RecentlyUnhealthy
453                } else {
454                    Self::Unhealthy
455                }
456            }
457        }
458    }
459}
460
461#[cfg(test)]
462mod tests {
463    use super::*;
464
465    use crate::route_table::Route;
466    use assert_matches::assert_matches;
467    use fidl_fuchsia_net as fnet;
468    use fuchsia_sync::Mutex;
469    use futures::FutureExt as _;
470    use net_declare::{fidl_ip, fidl_subnet};
471    use std::sync::Arc;
472
473    use crate::neighbor_cache::NeighborState;
474    use crate::route_table::RouteTable;
475    use crate::testutil;
476
477    #[test]
478    fn health_status_healthy() {
479        let now = SOME_TIME;
480        let mut status = HealthStatus::Healthy { last_action: None };
481        assert!(status.set_unhealthy_and_check_for_debug_info_cooldown(now));
482        assert_eq!(status, HealthStatus::Unhealthy { last_action: now });
483
484        status = HealthStatus::Healthy { last_action: Some(now) };
485        let later = now + zx::MonotonicDuration::from_seconds(1);
486        assert!(!status.set_unhealthy_and_check_for_debug_info_cooldown(later));
487        assert_eq!(status, HealthStatus::Unhealthy { last_action: now });
488
489        status = HealthStatus::Healthy { last_action: Some(now) };
490        let later = now + DEBUG_INFO_COOLDOWN;
491        assert!(status.set_unhealthy_and_check_for_debug_info_cooldown(later));
492        assert_eq!(status, HealthStatus::Unhealthy { last_action: later });
493    }
494
495    #[test]
496    fn health_status_unhealthy() {
497        let now = SOME_TIME;
498        let mut status = HealthStatus::Unhealthy { last_action: now };
499        let later = now + zx::MonotonicDuration::from_seconds(1);
500        assert!(!status.set_unhealthy_and_check_for_debug_info_cooldown(later));
501        assert_eq!(status, HealthStatus::Unhealthy { last_action: now });
502
503        let later = now + DEBUG_INFO_COOLDOWN;
504        assert!(status.set_unhealthy_and_check_for_debug_info_cooldown(later));
505        assert_eq!(status, HealthStatus::Unhealthy { last_action: later });
506    }
507
508    #[test]
509    fn timestamped_counter() {
510        let now = SOME_TIME;
511        let mut counter = TimestampedCounter { value: 1, at: now };
512
513        let later = now + zx::MonotonicDuration::from_seconds(1);
514        assert!(!counter.update(1, later));
515        assert_eq!(counter, TimestampedCounter { value: 1, at: now });
516
517        assert!(counter.update(2, later));
518        assert_eq!(counter, TimestampedCounter { value: 2, at: later });
519    }
520
    /// Initialization yields `None` until diagnostics are available, and
    /// otherwise seeds the state from the first counter read.
    #[fuchsia::test]
    async fn initialize_interface_state() {
        let now = SOME_TIME;

        // Nothing has been registered on the mock for IFACE1 yet, so the
        // interface is expected to be treated as unsupported.
        let sys = MockSystem::default();
        assert_matches!(Watchdog::initialize_interface_state(now, &sys, IFACE1).await, None);

        let counters = DeviceCounters { rx_frames: 1, tx_frames: 2 };
        sys.insert_interface_diagnostics(IFACE1);
        sys.increment_counters(IFACE1, counters.clone());

        // With diagnostics in place, both counters are stamped with `now` and
        // the interface starts out healthy with no prior action.
        let InterfaceDiagnosticsState { diagnostics: _, rx, tx, updated_at, health } =
            Watchdog::initialize_interface_state(now, &sys, IFACE1)
                .await
                .expect("failed to init interface");
        assert_eq!(rx, TimestampedCounter { value: counters.rx_frames, at: now });
        assert_eq!(tx, TimestampedCounter { value: counters.tx_frames, at: now });
        assert_eq!(updated_at, now);
        assert_eq!(health, HealthStatus::Healthy { last_action: None });
    }
541
    /// No action is produced when the view is built without neighbor entries
    /// or when neighbor information is absent altogether.
    #[fuchsia::test]
    async fn no_action_if_no_neighbors() {
        let sys = MockSystem::default();
        let now = SOME_TIME;
        let mut state = sys.new_diagnostics_state(now, IFACE1);
        // View built with no routes and no neighbor entries.
        let view = MockInterfaceView::new(IFACE1, None, None);
        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);
        // Same view, but with the neighbor information explicitly missing.
        assert_eq!(
            Watchdog::evaluate_interface_state(
                now,
                &mut state,
                InterfaceView { neighbors: None, ..view.view() }
            )
            .await,
            None
        );
    }
559
    /// An unhealthy neighbor that is not the next hop of any route is not a
    /// gateway and must not cause an action.
    #[fuchsia::test]
    async fn no_action_if_unreachable_neighbor_isnt_gateway() {
        let sys = MockSystem::default();
        let now = SOME_TIME;
        let mut state = sys.new_diagnostics_state(now, IFACE1);
        // No routes are installed, so NEIGH_V4 is never used as a next hop.
        let view = MockInterfaceView::new(IFACE1, None, [(NEIGH_V4, UNHEALTHY_NEIGHBOR)]);
        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);
    }
568
    /// When the only gateway is unhealthy the device counters are polled; a
    /// timeout on that poll yields a device-only diagnosis action.
    #[fuchsia::test]
    async fn poll_counters_if_neighbor_is_gateway() {
        let sys = MockSystem::default();
        let now = SOME_TIME;
        let mut state = sys.new_diagnostics_state(now, IFACE1);
        // NEIGH_V4 is the next hop of a route on IFACE1, making it a gateway.
        let view = MockInterfaceView::new(
            IFACE1,
            [Route {
                destination: SUBNET_V4,
                outbound_interface: IFACE1,
                next_hop: Some(NEIGH_V4),
            }],
            [(NEIGH_V4, UNHEALTHY_NEIGHBOR)],
        );
        // The timeout is only observable if the counters are actually read.
        sys.set_counters_return_timeout(IFACE1);
        assert_eq!(
            Watchdog::evaluate_interface_state(now, &mut state, view.view()).await,
            Some(Action {
                trigger_stack_diagnosis: false,
                trigger_device_diagnosis: true,
                reason: ActionReason::CantFetchCounters
            })
        );
    }
593
    /// Gateways that were never healthy are skipped: alone they cause no
    /// action, and they don't mask another gateway that is unhealthy.
    #[fuchsia::test]
    async fn ignore_never_healthy_neighbors() {
        // Unhealthy with no record of ever having been healthy.
        const NEVER_HEALTHY_NEIGHBOR: NeighborState =
            NeighborState::new(NeighborHealth::Unhealthy { last_healthy: None });

        let sys = MockSystem::default();
        let now = SOME_TIME;
        let mut state = sys.new_diagnostics_state(now, IFACE1);
        let view = MockInterfaceView::new(
            IFACE1,
            [Route {
                destination: SUBNET_V6,
                outbound_interface: IFACE1,
                next_hop: Some(NEIGH_V6),
            }],
            [(NEIGH_V6, NEVER_HEALTHY_NEIGHBOR)],
        );
        // Only never healthy neighbor doesn't trigger actions.
        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);

        // Once we have another eligible unhealthy gateway an action is
        // triggered.
        let view = MockInterfaceView::new(
            IFACE1,
            [
                Route {
                    destination: SUBNET_V4,
                    outbound_interface: IFACE1,
                    next_hop: Some(NEIGH_V4),
                },
                Route {
                    destination: SUBNET_V6,
                    outbound_interface: IFACE1,
                    next_hop: Some(NEIGH_V6),
                },
            ],
            [(NEIGH_V4, UNHEALTHY_NEIGHBOR), (NEIGH_V6, NEVER_HEALTHY_NEIGHBOR)],
        );
        // The counter timeout makes the counter poll visible as an action.
        sys.set_counters_return_timeout(IFACE1);
        assert_eq!(
            Watchdog::evaluate_interface_state(now, &mut state, view.view()).await,
            Some(Action {
                trigger_stack_diagnosis: false,
                trigger_device_diagnosis: true,
                reason: ActionReason::CantFetchCounters
            })
        );
    }
642
    /// A single healthy gateway is enough to take no action, even when another
    /// gateway on the same interface is unhealthy.
    #[fuchsia::test]
    async fn no_action_if_one_gateway_is_healthy() {
        let sys = MockSystem::default();
        let now = SOME_TIME;
        let mut state = sys.new_diagnostics_state(now, IFACE1);
        // Two gateways: the IPv4 one is unhealthy, the IPv6 one is healthy.
        let view = MockInterfaceView::new(
            IFACE1,
            [
                Route {
                    destination: SUBNET_V4,
                    outbound_interface: IFACE1,
                    next_hop: Some(NEIGH_V4),
                },
                Route {
                    destination: SUBNET_V6,
                    outbound_interface: IFACE1,
                    next_hop: Some(NEIGH_V6),
                },
            ],
            [(NEIGH_V4, UNHEALTHY_NEIGHBOR), (NEIGH_V6, HEALTHY_NEIGHBOR)],
        );
        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);
    }
666
    /// Walks an interface with an unhealthy gateway through counter stalls,
    /// the debug info cooldown, and recovery.
    ///
    /// NOTE(review): the comments below assume `increment_counters` adds the
    /// given amounts to the mock's counters - confirm against `MockSystem`.
    #[fuchsia::test]
    async fn actions_from_counters() {
        let sys = MockSystem::default();
        let now = SOME_TIME;
        let mut state = sys.new_diagnostics_state(now, IFACE1);
        let view = MockInterfaceView::new(
            IFACE1,
            [Route {
                destination: SUBNET_V4,
                outbound_interface: IFACE1,
                next_hop: Some(NEIGH_V4),
            }],
            [(NEIGH_V4, UNHEALTHY_NEIGHBOR)],
        );
        // Enough time has passed for a stall, but both counters moved, so no
        // action is expected.
        let now = now + DEVICE_COUNTERS_UNHEALTHY_TIME;
        sys.increment_counters(IFACE1, DeviceCounters { rx_frames: 10, tx_frames: 10 });
        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);

        // Only tx moves this time; rx has now been stuck for the full
        // unhealthy time, which is reported as an rx stall.
        let now = now + DEVICE_COUNTERS_UNHEALTHY_TIME;
        sys.increment_counters(IFACE1, DeviceCounters { rx_frames: 0, tx_frames: 10 });
        assert_eq!(
            Watchdog::evaluate_interface_state(now, &mut state, view.view()).await,
            Some(Action {
                trigger_stack_diagnosis: true,
                trigger_device_diagnosis: true,
                reason: ActionReason::DeviceRxStall
            })
        );
        sys.increment_counters(IFACE1, DeviceCounters { rx_frames: 10, tx_frames: 0 });

        // One second short of the cooldown since the rx stall action.
        let now = now + DEBUG_INFO_COOLDOWN - zx::MonotonicDuration::from_seconds(1);
        // Don't trigger again because of cooldown.
        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);

        // Now detect a tx stall.
        sys.increment_counters(IFACE1, DeviceCounters { rx_frames: 10, tx_frames: 0 });
        let now = now + zx::MonotonicDuration::from_seconds(1);
        assert_eq!(
            Watchdog::evaluate_interface_state(now, &mut state, view.view()).await,
            Some(Action {
                trigger_stack_diagnosis: true,
                trigger_device_diagnosis: true,
                reason: ActionReason::DeviceTxStall
            })
        );
        assert_eq!(state.health, HealthStatus::Unhealthy { last_action: now });

        let later = now + zx::MonotonicDuration::from_seconds(1);

        // If the gateway disappears, no action is taken but we maintain the
        // unhealthy state.
        let view = MockInterfaceView::new(IFACE1, None, [(NEIGH_V4, HEALTHY_NEIGHBOR)]);
        assert_eq!(Watchdog::evaluate_interface_state(later, &mut state, view.view()).await, None);
        assert_eq!(state.health, HealthStatus::Unhealthy { last_action: now });

        // Finally, if the gateway becomes healthy, the system should go back to
        // healthy state.
        let later = later + zx::MonotonicDuration::from_seconds(1);
        let view = MockInterfaceView::new(
            IFACE1,
            [Route {
                destination: SUBNET_V4,
                outbound_interface: IFACE1,
                next_hop: Some(NEIGH_V4),
            }],
            [(NEIGH_V4, HEALTHY_NEIGHBOR)],
        );
        assert_eq!(Watchdog::evaluate_interface_state(later, &mut state, view.view()).await, None);
        // The time of the last action survives the transition to healthy.
        assert_eq!(state.health, HealthStatus::Healthy { last_action: Some(now) });
    }
737
    /// End-to-end check that `check_interface_state` issues device and system
    /// debug info requests for a stalled interface and honors the cooldown.
    ///
    /// The counters are never incremented in this test, so they appear
    /// stalled once enough time has passed.
    #[fuchsia::test]
    async fn triggers_diagnostics_requests() {
        let sys = MockSystem::default();
        sys.insert_interface_diagnostics(IFACE1);
        let now = SOME_TIME;
        let view = MockInterfaceView::new(
            IFACE1,
            [Route {
                destination: SUBNET_V4,
                outbound_interface: IFACE1,
                next_hop: Some(NEIGH_V4),
            }],
            [(NEIGH_V4, UNHEALTHY_NEIGHBOR)],
        );

        // First poll: the interface state is created at `now`, so no stall can
        // have been observed yet.
        let mut watchdog = Watchdog::new();
        watchdog.check_interface_state(now, &sys, view.view()).await;
        assert!(!sys.take_interface_debug_requested(IFACE1));
        assert!(!sys.take_system_debug_requested());

        // The counters have now been stuck for the full unhealthy time.
        let now = now + DEVICE_COUNTERS_UNHEALTHY_TIME;
        watchdog.check_interface_state(now, &sys, view.view()).await;
        assert!(sys.take_interface_debug_requested(IFACE1));
        assert!(sys.take_system_debug_requested());

        // Still unhealthy, but cooling down on debug requests.
        let now = now + DEBUG_INFO_COOLDOWN / 2;
        watchdog.check_interface_state(now, &sys, view.view()).await;
        assert!(!sys.take_interface_debug_requested(IFACE1));
        assert!(!sys.take_system_debug_requested());

        // Past the cooldown, both requests are issued again.
        let now = now + DEBUG_INFO_COOLDOWN;
        watchdog.check_interface_state(now, &sys, view.view()).await;
        assert!(sys.take_interface_debug_requested(IFACE1));
        assert!(sys.take_system_debug_requested());
    }
774
775    #[fuchsia::test]
776    fn gateway_health() {
777        let now = SOME_TIME;
778
779        // Healthy neighbor is never considered unhealthy.
780        assert_eq!(
781            GatewayHealth::from_neighbor_health(
782                &NeighborHealth::Healthy { last_observed: now },
783                now
784            ),
785            GatewayHealth::Healthy
786        );
787        assert_eq!(
788            GatewayHealth::from_neighbor_health(
789                &NeighborHealth::Healthy { last_observed: now },
790                now + zx::MonotonicDuration::from_minutes(60),
791            ),
792            GatewayHealth::Healthy
793        );
794
795        // Neighbor is unhealthy has never been healthy.
796        assert_eq!(
797            GatewayHealth::from_neighbor_health(
798                &NeighborHealth::Unhealthy { last_healthy: None },
799                now
800            ),
801            GatewayHealth::NeverHealthy
802        );
803
804        // Unhealthy neighbor is only considered unhealthy gateway after some
805        // time.
806        assert_eq!(
807            GatewayHealth::from_neighbor_health(
808                &NeighborHealth::Unhealthy { last_healthy: Some(now) },
809                now
810            ),
811            GatewayHealth::RecentlyUnhealthy
812        );
813        assert_eq!(
814            GatewayHealth::from_neighbor_health(
815                &NeighborHealth::Unhealthy { last_healthy: Some(now) },
816                now + NEIGHBOR_UNHEALTHY_TIME
817            ),
818            GatewayHealth::Unhealthy
819        );
820    }
821
    /// The monotonic instant at zero nanoseconds.
    const ZERO_TIME: zx::MonotonicInstant = zx::MonotonicInstant::from_nanos(0);
    /// The instant exactly `NEIGHBOR_UNHEALTHY_TIME` after `ZERO_TIME`.
    const SOME_TIME: zx::MonotonicInstant =
        zx::MonotonicInstant::from_nanos(NEIGHBOR_UNHEALTHY_TIME.into_nanos());
    /// A neighbor in unhealthy state that was last healthy at `ZERO_TIME`.
    const UNHEALTHY_NEIGHBOR: NeighborState =
        NeighborState::new(NeighborHealth::Unhealthy { last_healthy: Some(ZERO_TIME) });
    /// A neighbor in healthy state that was last observed at `ZERO_TIME`.
    const HEALTHY_NEIGHBOR: NeighborState =
        NeighborState::new(NeighborHealth::Healthy { last_observed: ZERO_TIME });

    /// Interface identifier used by the tests.
    const IFACE1: InterfaceId = 1;
    /// IPv4 neighbor address used by the tests.
    const NEIGH_V4: fnet::IpAddress = fidl_ip!("192.0.2.1");
    /// IPv6 neighbor address used by the tests.
    const NEIGH_V6: fnet::IpAddress = fidl_ip!("2001:db8::1");
    // Arbitrary subnet values with which to create routes.
    const SUBNET_V4: fnet::Subnet = fidl_subnet!("0.0.0.0/0");
    const SUBNET_V6: fnet::Subnet = fidl_subnet!("::0/0");
836
    /// Owns the data that an `InterfaceView` borrows, so tests can build views
    /// through `MockInterfaceView::view`.
    struct MockInterfaceView {
        /// Interface properties exposed through the view.
        properties: fnet_interfaces_ext::Properties<fnet_interfaces_ext::DefaultInterest>,
        /// Route table exposed through the view.
        routes: RouteTable,
        /// Neighbor cache exposed through the view.
        neighbors: crate::InterfaceNeighborCache,
    }
842
843    impl MockInterfaceView {
844        fn new<
845            R: IntoIterator<Item = Route>,
846            N: IntoIterator<Item = (fnet::IpAddress, NeighborState)>,
847        >(
848            id: InterfaceId,
849            routes: R,
850            neighbors: N,
851        ) -> Self {
852            Self {
853                properties: fnet_interfaces_ext::Properties {
854                    id: id.try_into().expect("should be nonzero"),
855                    name: "foo".to_owned(),
856                    port_class: fnet_interfaces_ext::PortClass::Loopback,
857                    online: true,
858                    addresses: vec![],
859                    has_default_ipv4_route: true,
860                    has_default_ipv6_route: true,
861                },
862                routes: testutil::build_route_table_from_flattened_routes(routes),
863                neighbors: neighbors.into_iter().collect(),
864            }
865        }
866
867        fn view(&self) -> InterfaceView<'_> {
868            let Self { properties, routes, neighbors } = self;
869            InterfaceView { properties, routes: &routes, neighbors: Some(neighbors) }
870        }
871    }
872
    /// Mock state tracked for a single interface.
    #[derive(Debug)]
    struct MockCounterState {
        /// The result handed out by `MockDiagnostics::get_counters`. When this
        /// is `None`, `get_counters` panics.
        counters_result: Option<Result<DeviceCounters, Error>>,
        /// Set when `MockDiagnostics::log_debug_info` is called for the
        /// interface; reset by `MockSystem::take_interface_debug_requested`.
        debug_requested: bool,
    }
878
    /// Shared, lock-protected map from interface identifier to the mock state
    /// of that interface.
    type MockState = Arc<Mutex<HashMap<InterfaceId, MockCounterState>>>;

    /// The watchdog under test, specialized to the mock system.
    type Watchdog = super::Watchdog<MockSystem>;
882
    /// Mock `SystemDispatcher` implementation that records debug requests and
    /// serves counters configured by the test.
    #[derive(Default)]
    struct MockSystem {
        /// Per-interface mock state, shared with every `MockDiagnostics`
        /// handed out by `get_device_diagnostics`.
        inner: MockState,
        /// Set when system-wide `log_debug_info` is called; read and reset by
        /// `take_system_debug_requested`.
        debug_info_requested: std::sync::atomic::AtomicBool,
    }
888
889    #[async_trait::async_trait]
890    impl SystemDispatcher for MockSystem {
891        type DeviceDiagnostics = MockDiagnostics;
892
893        async fn log_debug_info(&self) -> Result<(), Error> {
894            let Self { inner: _, debug_info_requested } = self;
895            debug_info_requested.store(true, std::sync::atomic::Ordering::SeqCst);
896            Ok(())
897        }
898
899        fn get_device_diagnostics(
900            &self,
901            interface: InterfaceId,
902        ) -> Result<Self::DeviceDiagnostics, Error> {
903            let Self { inner, debug_info_requested: _ } = self;
904            Ok(MockDiagnostics { inner: inner.clone(), interface })
905        }
906    }
907
908    impl MockSystem {
909        fn insert_interface_diagnostics(&self, interface: InterfaceId) {
910            let counters = DeviceCounters { rx_frames: 0, tx_frames: 0 };
911            assert_matches!(
912                self.inner.lock().insert(
913                    interface,
914                    MockCounterState {
915                        counters_result: Some(Ok(counters)),
916                        debug_requested: false
917                    }
918                ),
919                None
920            );
921        }
922
923        fn new_diagnostics_state(
924            &self,
925            now: zx::MonotonicInstant,
926            interface: InterfaceId,
927        ) -> InterfaceDiagnosticsState<MockDiagnostics> {
928            self.insert_interface_diagnostics(interface);
929            let state = Watchdog::initialize_interface_state(now, self, interface)
930                .now_or_never()
931                .expect("future should be ready")
932                .expect("failed to initialize interface state");
933
934            // Remove the initial counters to force tests that use this function
935            // to explicitly set any counter values they may wish to use.
936            self.inner.lock().get_mut(&interface).unwrap().counters_result = None;
937
938            state
939        }
940
941        fn set_counters_return_timeout(&self, interface: InterfaceId) {
942            self.inner.lock().get_mut(&interface).unwrap().counters_result =
943                Some(Err(Error::Timeout));
944        }
945
946        fn increment_counters(
947            &self,
948            interface: InterfaceId,
949            DeviceCounters { rx_frames: rx, tx_frames: tx }: DeviceCounters,
950        ) {
951            let mut state = self.inner.lock();
952            let MockCounterState { counters_result, debug_requested: _ } =
953                state.get_mut(&interface).unwrap();
954            *counters_result = Some(Ok(match counters_result {
955                Some(Ok(DeviceCounters { rx_frames, tx_frames })) => {
956                    DeviceCounters { rx_frames: *rx_frames + rx, tx_frames: *tx_frames + tx }
957                }
958                None | Some(Err(_)) => DeviceCounters { rx_frames: rx, tx_frames: tx },
959            }));
960        }
961
962        fn take_interface_debug_requested(&self, interface: InterfaceId) -> bool {
963            let mut state = self.inner.lock();
964            if let Some(MockCounterState { counters_result: _, debug_requested }) =
965                state.get_mut(&interface)
966            {
967                std::mem::replace(debug_requested, false)
968            } else {
969                false
970            }
971        }
972
973        fn take_system_debug_requested(&self) -> bool {
974            self.debug_info_requested.swap(false, std::sync::atomic::Ordering::SeqCst)
975        }
976    }
977
    /// Mock `DeviceDiagnosticsProvider` for a single interface.
    #[derive(Debug)]
    struct MockDiagnostics {
        /// Per-interface mock state, shared with the `MockSystem` that created
        /// this handle.
        inner: MockState,
        /// The interface whose state this handle reads and updates.
        interface: InterfaceId,
    }
983
984    #[async_trait::async_trait]
985    impl DeviceDiagnosticsProvider for MockDiagnostics {
986        async fn get_counters(&self) -> Result<DeviceCounters, Error> {
987            let Self { inner, interface } = self;
988            let state = inner.lock();
989            state.get(interface).ok_or_else(|| Error::Fidl(fidl::Error::Invalid)).and_then(
990                |MockCounterState { counters_result, debug_requested: _ }| {
991                    counters_result.clone().expect("called get_counters on uninitialized mock")
992                },
993            )
994        }
995
996        async fn log_debug_info(&self) -> Result<(), Error> {
997            let Self { inner, interface } = self;
998            let mut state = inner.lock();
999            let MockCounterState { counters_result: _, debug_requested } =
1000                state.get_mut(interface).unwrap();
1001            *debug_requested = true;
1002            Ok(())
1003        }
1004    }
1005}