reachability_core/
watchdog.rs

1// Copyright 2022 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5//! Provides an interface health watchdog.
6//!
7//! The watchdog uses gateway neighbor reachability information and interface
8//! counters to evaluate interface health and triggers debug information dumps
9//! on the system logs when it finds unhealthy interfaces.
10
11use crate::neighbor_cache::NeighborHealth;
12use crate::{Id as InterfaceId, InterfaceView};
13use fidl_fuchsia_net_interfaces_ext as fnet_interfaces_ext;
14use itertools::Itertools as _;
15use log::{debug, error, info, warn};
16use std::collections::HashMap;
17
/// The minimum amount of time for a device counter to be stuck in the same
/// value for the device to be considered unhealthy.
///
/// Compared against the time elapsed since an rx or tx frame counter last
/// changed value.
const DEVICE_COUNTERS_UNHEALTHY_TIME: zx::MonotonicDuration =
    zx::MonotonicDuration::from_minutes(2);

/// The minimum amount of time to wait before generating a new request for debug
/// information.
///
/// Enforced by [`HealthStatus`], which records when the last request was made.
const DEBUG_INFO_COOLDOWN: zx::MonotonicDuration = zx::MonotonicDuration::from_minutes(15);

/// The minimum amount of time for a neighbor in unhealthy state to trigger
/// actions.
///
/// A gateway that was last seen healthy more recently than this is classified
/// as [`GatewayHealth::RecentlyUnhealthy`] rather than
/// [`GatewayHealth::Unhealthy`].
const NEIGHBOR_UNHEALTHY_TIME: zx::MonotonicDuration = zx::MonotonicDuration::from_minutes(1);
30
/// Errors reported by the diagnostics providers used by the watchdog.
#[derive(Debug, thiserror::Error)]
#[cfg_attr(test, derive(Clone))]
pub enum Error {
    /// The operation did not complete in time.
    #[error("Operation timed out")]
    Timeout,
    /// A FIDL error was encountered; convertible from [`fidl::Error`].
    #[error("FIDL error {0}")]
    Fidl(#[from] fidl::Error),
    /// The requested operation is not supported.
    #[error("Unsupported operation")]
    NotSupported,
}
41
/// A snapshot of a device's frame counters.
#[derive(Debug)]
#[cfg_attr(test, derive(Copy, Clone))]
pub struct DeviceCounters {
    /// Frame counter for the receive (rx) path.
    pub rx_frames: u64,
    /// Frame counter for the transmit (tx) path.
    pub tx_frames: u64,
}
48
/// A counter value paired with the time at which that value was first
/// observed.
#[derive(Debug)]
#[cfg_attr(test, derive(Eq, PartialEq))]
struct TimestampedCounter {
    /// The last observed counter value.
    value: u64,
    /// The time at which `value` last changed.
    at: zx::MonotonicInstant,
}
55
56impl TimestampedCounter {
57    fn update(&mut self, new_value: u64, new_at: zx::MonotonicInstant) -> bool {
58        let Self { value, at } = self;
59        if new_value != *value {
60            *at = new_at;
61            *value = new_value;
62            true
63        } else {
64            false
65        }
66    }
67}
68
/// Provides access to a device's counters and debug information.
#[async_trait::async_trait]
pub trait DeviceDiagnosticsProvider {
    /// Fetches the device's current frame counters.
    async fn get_counters(&self) -> Result<DeviceCounters, Error>;

    /// Requests that debug information for the device be logged.
    async fn log_debug_info(&self) -> Result<(), Error>;
}
75
/// Tracks whether something is currently considered healthy, along with the
/// time of the last debug info action so that a cooldown can be enforced.
#[derive(Debug)]
#[cfg_attr(test, derive(Eq, PartialEq))]
enum HealthStatus {
    /// Currently unhealthy; `last_action` is the time of the most recent debug
    /// info action.
    Unhealthy { last_action: zx::MonotonicInstant },
    /// Currently healthy; `last_action` is the time of the most recent debug
    /// info action, or `None` if no action was ever triggered.
    Healthy { last_action: Option<zx::MonotonicInstant> },
}
82
83impl HealthStatus {
84    /// Sets the status to unhealthy at time `now`.
85    ///
86    /// Return `true` if a debug info action should be triggered respecting
87    /// cooldown.
88    fn set_unhealthy_and_check_for_debug_info_cooldown(
89        &mut self,
90        now: zx::MonotonicInstant,
91    ) -> bool {
92        let last_action = match self {
93            HealthStatus::Unhealthy { last_action } => Some(*last_action),
94            HealthStatus::Healthy { last_action } => *last_action,
95        };
96
97        let (trigger_debug_info, last_action) = match last_action
98            .map(|last_action| (now - last_action >= DEBUG_INFO_COOLDOWN, last_action))
99        {
100            // Either we haven't yet triggered a debug info action or we've
101            // passed the cooldown period.
102            None | Some((true, _)) => (true, now),
103            // We're still in the cooldown period from the last triggered
104            // action.
105            Some((false, last_action)) => (false, last_action),
106        };
107
108        *self = HealthStatus::Unhealthy { last_action: last_action };
109
110        return trigger_debug_info;
111    }
112
113    /// Sets the system health status to healthy.
114    fn set_healthy(&mut self) {
115        match self {
116            HealthStatus::Unhealthy { last_action } => {
117                *self = HealthStatus::Healthy { last_action: Some(*last_action) };
118            }
119            HealthStatus::Healthy { last_action: _ } => {}
120        }
121    }
122}
123
/// Watchdog state for an interface whose device diagnostics are available.
#[derive(Debug)]
struct InterfaceDiagnosticsState<D> {
    /// Handle used to fetch device counters and request device debug info.
    diagnostics: D,
    /// Last observed rx frame counter and the time it last changed.
    rx: TimestampedCounter,
    /// Last observed tx frame counter and the time it last changed.
    tx: TimestampedCounter,
    /// The last time the counters were successfully read.
    updated_at: zx::MonotonicInstant,
    /// Health of this interface, including the debug info cooldown state.
    health: HealthStatus,
}
132
/// Per-interface state kept by the [`Watchdog`].
#[derive(Debug)]
struct InterfaceState<D> {
    /// Diagnostics state, or `None` if it could not be initialized, in which
    /// case the interface is treated as unsupported.
    diagnostics_state: Option<InterfaceDiagnosticsState<D>>,
}
137
/// Evaluates interface health and requests debug information when an
/// interface is found to be unhealthy.
pub struct Watchdog<S: SystemDispatcher> {
    /// State for every interface seen so far, keyed by interface ID.
    interfaces: HashMap<InterfaceId, InterfaceState<S::DeviceDiagnostics>>,
    /// Cooldown state for system-wide debug info requests.
    system_health_status: HealthStatus,
    /// Ties the watchdog to its dispatcher type without storing one.
    _marker: std::marker::PhantomData<S>,
}
143
/// Abstracts the system services the [`Watchdog`] depends on.
#[async_trait::async_trait]
pub trait SystemDispatcher {
    /// The type used to access diagnostics for a single device.
    type DeviceDiagnostics: DeviceDiagnosticsProvider;

    /// Requests that system-wide debug information be logged.
    async fn log_debug_info(&self) -> Result<(), Error>;

    /// Returns a diagnostics handle for the device backing `interface`.
    fn get_device_diagnostics(
        &self,
        interface: InterfaceId,
    ) -> Result<Self::DeviceDiagnostics, Error>;
}
155
/// The reason a debug [`Action`] was requested.
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
enum ActionReason {
    /// Fetching the device counters timed out.
    CantFetchCounters,
    /// The rx frame counter hasn't changed for at least
    /// [`DEVICE_COUNTERS_UNHEALTHY_TIME`].
    DeviceRxStall,
    /// The tx frame counter hasn't changed for at least
    /// [`DEVICE_COUNTERS_UNHEALTHY_TIME`].
    DeviceTxStall,
}
162
/// A debugging action requested after evaluating an interface.
#[derive(Debug, Eq, PartialEq)]
struct Action {
    /// Whether system-wide debug info should be requested (still subject to
    /// the system-wide cooldown).
    trigger_stack_diagnosis: bool,
    /// Whether debug info should be requested from the interface's device.
    trigger_device_diagnosis: bool,
    /// Why the action was requested.
    reason: ActionReason,
}
169
170impl<S> Watchdog<S>
171where
172    S: SystemDispatcher,
173{
174    pub fn new() -> Self {
175        Self {
176            interfaces: HashMap::new(),
177            system_health_status: HealthStatus::Healthy { last_action: None },
178            _marker: std::marker::PhantomData,
179        }
180    }
181
182    async fn initialize_interface_state(
183        now: zx::MonotonicInstant,
184        sys: &S,
185        interface: InterfaceId,
186    ) -> Option<InterfaceDiagnosticsState<S::DeviceDiagnostics>> {
187        // Get a diagnostics handle and read the initial counters.
188        let diagnostics = match sys.get_device_diagnostics(interface) {
189            Ok(d) => d,
190            Err(e) => {
191                warn!(
192                    err:? = e,
193                    iface = interface;
194                    "failed to read diagnostics state, assuming unsupported interface"
195                );
196                return None;
197            }
198        };
199        let DeviceCounters { rx_frames, tx_frames } = match diagnostics.get_counters().await {
200            Ok(c) => c,
201            Err(e) => {
202                warn!(
203                    err:? = e,
204                    iface = interface;
205                    "failed to read device counters, assuming unsupported interface"
206                );
207                return None;
208            }
209        };
210        Some(InterfaceDiagnosticsState {
211            diagnostics,
212            rx: TimestampedCounter { value: rx_frames, at: now },
213            tx: TimestampedCounter { value: tx_frames, at: now },
214            updated_at: now,
215            health: HealthStatus::Healthy { last_action: None },
216        })
217    }
218
219    pub async fn check_interface_state(
220        &mut self,
221        now: zx::MonotonicInstant,
222        sys: &S,
223        view: InterfaceView<'_>,
224    ) {
225        debug!(view:? = view; "poll interface state");
226        let Self { interfaces, system_health_status, _marker: _ } = self;
227
228        let interface = view.properties.id;
229
230        let InterfaceState { diagnostics_state } = match interfaces.entry(interface.get()) {
231            std::collections::hash_map::Entry::Occupied(entry) => entry.into_mut(),
232            std::collections::hash_map::Entry::Vacant(vacant) => vacant.insert(InterfaceState {
233                diagnostics_state: Self::initialize_interface_state(now, sys, interface.get())
234                    .await,
235            }),
236        };
237
238        let diagnostics_state = if let Some(d) = diagnostics_state.as_mut() {
239            d
240        } else {
241            // Do nothing for unsupported interfaces, we can't get counters or
242            // trigger debug info on them.
243            return;
244        };
245
246        if let Some(action) = Self::evaluate_interface_state(now, diagnostics_state, view).await {
247            info!(
248                action:? = action,
249                iface = interface;
250                "bad state detected, action requested"
251            );
252            let Action { trigger_stack_diagnosis, trigger_device_diagnosis, reason: _ } = action;
253            if trigger_device_diagnosis {
254                diagnostics_state.diagnostics.log_debug_info().await.unwrap_or_else(
255                    |e| error!(err:? = e, iface = interface; "failed to request device debug info"),
256                );
257            }
258            if trigger_stack_diagnosis {
259                if system_health_status.set_unhealthy_and_check_for_debug_info_cooldown(now) {
260                    sys.log_debug_info().await.unwrap_or_else(
261                        |e| error!(err:? = e; "failed to request system debug info"),
262                    );
263                }
264            }
265        }
266    }
267
268    /// Evaluates the given interface state, returning an optional debugging
269    /// action to be triggered.
270    ///
271    /// Interfaces are evaluated at two levels. First, all the gateways are
272    /// evaluated against the neighbor table. Second, if all gateways are
273    /// unhealthy, the device counters are polled until a stall is observed. If
274    /// an Rx or Tx stall is seen, a debug action will be requested.
275    ///
276    /// If there's a timeout attempting to fetch interface counters, a debug
277    /// request may also be issued.
278    async fn evaluate_interface_state(
279        now: zx::MonotonicInstant,
280        diag_state: &mut InterfaceDiagnosticsState<S::DeviceDiagnostics>,
281        InterfaceView {
282            properties: fnet_interfaces_ext::Properties { id: interface, .. },
283            routes,
284            neighbors,
285        }: InterfaceView<'_>,
286    ) -> Option<Action> {
287        let InterfaceDiagnosticsState { diagnostics, rx, tx, updated_at, health } = diag_state;
288        let interface = *interface;
289
290        debug!(iface = interface; "evaluate interface state");
291
292        let mut neighbors = neighbors.as_ref()?.iter_health();
293        let found_healthy_gateway = neighbors
294            .fold_while(None, |found_healthy_gateway, (neighbor, health)| {
295                let is_router = routes.device_routes(interface.get()).any(|route| {
296                    route.next_hop.map(|next_hop| *neighbor == next_hop).unwrap_or(false)
297                });
298
299                if !is_router {
300                    return itertools::FoldWhile::Continue(found_healthy_gateway);
301                }
302
303                let gateway_health = GatewayHealth::from_neighbor_health(health, now);
304                debug!(
305                    iface = interface,
306                    neighbor:? = fidl_fuchsia_net_ext::IpAddress::from(neighbor.clone()),
307                    health:? = gateway_health;
308                    "router check"
309                );
310                match gateway_health {
311                    // When we find a healthy neighbor, immediately break the
312                    // fold.
313                    GatewayHealth::Healthy
314                    // A gateway that hasn't been unhealthy for a long time may
315                    // only be going through a temporary outage.
316                    | GatewayHealth::RecentlyUnhealthy
317                    // Unknown gateway state is assumed to be healthy. Expected
318                    // to shift once neighbor table fills up.
319                    | GatewayHealth::Unknown
320                    => {
321                        itertools::FoldWhile::Done(Some(true))
322                    }
323                    // A gateway that was never healthy is considered a
324                    // misconfiguration and should not trip the watchdog.
325                    // Skip it entirely so it's not considered for the search.
326                    | GatewayHealth::NeverHealthy => {
327                        itertools::FoldWhile::Continue(found_healthy_gateway)
328                    }
329                    GatewayHealth::Unhealthy => itertools::FoldWhile::Continue(Some(false)),
330                }
331            })
332            .into_inner();
333
334        match found_healthy_gateway {
335            // If there are no gateways, there's not much we can do. Assume that
336            // either the interface is not configured for upstream connectivity
337            // or we're going through a link flap event.
338            None => {
339                debug!(iface = interface; "no gateway in neighbors");
340                return None;
341            }
342            // If there's at least one healthy gateway, there's no action to be
343            // taken, but we can mark the interface as healthy.
344            Some(true) => {
345                debug!(iface = interface; "neighbors are healthy");
346                health.set_healthy();
347                return None;
348            }
349            // If we found at least one gateway and they're all unhealthy,
350            // proceed to check device counters.
351            Some(false) => (),
352        }
353
354        let counters = match diagnostics.get_counters().await {
355            Ok(counters) => counters,
356            Err(Error::Timeout) => {
357                return Some(Action {
358                    trigger_stack_diagnosis: false,
359                    trigger_device_diagnosis: true,
360                    reason: ActionReason::CantFetchCounters,
361                });
362            }
363            Err(Error::Fidl(e)) => {
364                if !e.is_closed() {
365                    error!(
366                        e:? = e,
367                        iface = interface;
368                        "failed to read counters for interface, no action will be taken"
369                    );
370                }
371                return None;
372            }
373            Err(Error::NotSupported) => {
374                error!(
375                    iface = interface;
376                    "failed to read counters for interface, no action will be taken"
377                );
378                return None;
379            }
380        };
381        let DeviceCounters { rx_frames, tx_frames } = counters;
382        if !rx.update(rx_frames, now) {
383            warn!(
384                rx:? = rx,
385                now = now.into_nanos(),
386                iface = interface;
387                "failed to observe rx traffic since last check"
388            );
389        }
390        if !tx.update(tx_frames, now) {
391            warn!(
392                tx:? = tx,
393                now = now.into_nanos(),
394                iface = interface;
395                "failed to observe tx traffic since last check"
396            );
397        }
398        *updated_at = now;
399        if let Some(reason) = [(rx, ActionReason::DeviceRxStall), (tx, ActionReason::DeviceTxStall)]
400            .iter()
401            .find_map(|(TimestampedCounter { value: _, at }, reason)| {
402                (now - *at >= DEVICE_COUNTERS_UNHEALTHY_TIME).then_some(*reason)
403            })
404        {
405            let action = health.set_unhealthy_and_check_for_debug_info_cooldown(now).then_some({
406                Action { trigger_stack_diagnosis: true, trigger_device_diagnosis: true, reason }
407            });
408
409            return action;
410        }
411
412        info!(
413            iface = interface,
414            rx = rx_frames,
415            tx = tx_frames;
416            "gateways are unhealthy, but counters are healthy."
417        );
418
419        // Counters are not stalled, mark the interface as healthy.
420        health.set_healthy();
421
422        None
423    }
424
425    pub fn handle_interface_removed(&mut self, interface: InterfaceId) {
426        let Self { interfaces, system_health_status: _, _marker: _ } = self;
427        match interfaces.remove(&interface) {
428            Some(InterfaceState { .. }) => (),
429            None => error!(iface = interface; "attempted to remove unknown interface"),
430        }
431    }
432}
433
/// The watchdog's classification of a gateway, derived from the gateway's
/// [`NeighborHealth`] and the current time.
#[derive(Debug, PartialEq, Eq)]
enum GatewayHealth {
    /// The neighbor's health is unknown.
    Unknown,
    /// The neighbor is healthy.
    Healthy,
    /// The neighbor is unhealthy, but was last healthy less than
    /// [`NEIGHBOR_UNHEALTHY_TIME`] ago.
    RecentlyUnhealthy,
    /// The neighbor is unhealthy and was last healthy at least
    /// [`NEIGHBOR_UNHEALTHY_TIME`] ago.
    Unhealthy,
    /// The neighbor is unhealthy and has never been observed healthy.
    NeverHealthy,
}
442
443impl GatewayHealth {
444    /// Checks if a gateway with reported `health` should be considered healthy.
445    fn from_neighbor_health(health: &NeighborHealth, now: zx::MonotonicInstant) -> Self {
446        match health {
447            NeighborHealth::Unknown => Self::Unknown,
448            NeighborHealth::Healthy { last_observed: _ } => Self::Healthy,
449            NeighborHealth::Unhealthy { last_healthy: None } => Self::NeverHealthy,
450            NeighborHealth::Unhealthy { last_healthy: Some(last_healthy) } => {
451                if now - *last_healthy < NEIGHBOR_UNHEALTHY_TIME {
452                    Self::RecentlyUnhealthy
453                } else {
454                    Self::Unhealthy
455                }
456            }
457        }
458    }
459}
460
461#[cfg(test)]
462mod tests {
463    use super::*;
464
465    use crate::route_table::Route;
466    use assert_matches::assert_matches;
467    use fidl_fuchsia_net as fnet;
468    use futures::FutureExt as _;
469    use net_declare::{fidl_ip, fidl_subnet};
470    use std::sync::{Arc, Mutex};
471
472    use crate::neighbor_cache::NeighborState;
473    use crate::route_table::RouteTable;
474    use crate::testutil;
475
476    #[test]
477    fn health_status_healthy() {
478        let now = SOME_TIME;
479        let mut status = HealthStatus::Healthy { last_action: None };
480        assert!(status.set_unhealthy_and_check_for_debug_info_cooldown(now));
481        assert_eq!(status, HealthStatus::Unhealthy { last_action: now });
482
483        status = HealthStatus::Healthy { last_action: Some(now) };
484        let later = now + zx::MonotonicDuration::from_seconds(1);
485        assert!(!status.set_unhealthy_and_check_for_debug_info_cooldown(later));
486        assert_eq!(status, HealthStatus::Unhealthy { last_action: now });
487
488        status = HealthStatus::Healthy { last_action: Some(now) };
489        let later = now + DEBUG_INFO_COOLDOWN;
490        assert!(status.set_unhealthy_and_check_for_debug_info_cooldown(later));
491        assert_eq!(status, HealthStatus::Unhealthy { last_action: later });
492    }
493
494    #[test]
495    fn health_status_unhealthy() {
496        let now = SOME_TIME;
497        let mut status = HealthStatus::Unhealthy { last_action: now };
498        let later = now + zx::MonotonicDuration::from_seconds(1);
499        assert!(!status.set_unhealthy_and_check_for_debug_info_cooldown(later));
500        assert_eq!(status, HealthStatus::Unhealthy { last_action: now });
501
502        let later = now + DEBUG_INFO_COOLDOWN;
503        assert!(status.set_unhealthy_and_check_for_debug_info_cooldown(later));
504        assert_eq!(status, HealthStatus::Unhealthy { last_action: later });
505    }
506
507    #[test]
508    fn timestamped_counter() {
509        let now = SOME_TIME;
510        let mut counter = TimestampedCounter { value: 1, at: now };
511
512        let later = now + zx::MonotonicDuration::from_seconds(1);
513        assert!(!counter.update(1, later));
514        assert_eq!(counter, TimestampedCounter { value: 1, at: now });
515
516        assert!(counter.update(2, later));
517        assert_eq!(counter, TimestampedCounter { value: 2, at: later });
518    }
519
520    #[fuchsia::test]
521    async fn initialize_interface_state() {
522        let now = SOME_TIME;
523
524        let sys = MockSystem::default();
525        assert_matches!(Watchdog::initialize_interface_state(now, &sys, IFACE1).await, None);
526
527        let counters = DeviceCounters { rx_frames: 1, tx_frames: 2 };
528        sys.insert_interface_diagnostics(IFACE1);
529        sys.increment_counters(IFACE1, counters.clone());
530
531        let InterfaceDiagnosticsState { diagnostics: _, rx, tx, updated_at, health } =
532            Watchdog::initialize_interface_state(now, &sys, IFACE1)
533                .await
534                .expect("failed to init interface");
535        assert_eq!(rx, TimestampedCounter { value: counters.rx_frames, at: now });
536        assert_eq!(tx, TimestampedCounter { value: counters.tx_frames, at: now });
537        assert_eq!(updated_at, now);
538        assert_eq!(health, HealthStatus::Healthy { last_action: None });
539    }
540
541    #[fuchsia::test]
542    async fn no_action_if_no_neighbors() {
543        let sys = MockSystem::default();
544        let now = SOME_TIME;
545        let mut state = sys.new_diagnostics_state(now, IFACE1);
546        let view = MockInterfaceView::new(IFACE1, None, None);
547        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);
548        assert_eq!(
549            Watchdog::evaluate_interface_state(
550                now,
551                &mut state,
552                InterfaceView { neighbors: None, ..view.view() }
553            )
554            .await,
555            None
556        );
557    }
558
559    #[fuchsia::test]
560    async fn no_action_if_unreachable_neighbor_isnt_gateway() {
561        let sys = MockSystem::default();
562        let now = SOME_TIME;
563        let mut state = sys.new_diagnostics_state(now, IFACE1);
564        let view = MockInterfaceView::new(IFACE1, None, [(NEIGH_V4, UNHEALTHY_NEIGHBOR)]);
565        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);
566    }
567
568    #[fuchsia::test]
569    async fn poll_counters_if_neighbor_is_gateway() {
570        let sys = MockSystem::default();
571        let now = SOME_TIME;
572        let mut state = sys.new_diagnostics_state(now, IFACE1);
573        let view = MockInterfaceView::new(
574            IFACE1,
575            [Route {
576                destination: SUBNET_V4,
577                outbound_interface: IFACE1,
578                next_hop: Some(NEIGH_V4),
579            }],
580            [(NEIGH_V4, UNHEALTHY_NEIGHBOR)],
581        );
582        sys.set_counters_return_timeout(IFACE1);
583        assert_eq!(
584            Watchdog::evaluate_interface_state(now, &mut state, view.view()).await,
585            Some(Action {
586                trigger_stack_diagnosis: false,
587                trigger_device_diagnosis: true,
588                reason: ActionReason::CantFetchCounters
589            })
590        );
591    }
592
593    #[fuchsia::test]
594    async fn ignore_never_healthy_neighbors() {
595        const NEVER_HEALTHY_NEIGHBOR: NeighborState =
596            NeighborState::new(NeighborHealth::Unhealthy { last_healthy: None });
597
598        let sys = MockSystem::default();
599        let now = SOME_TIME;
600        let mut state = sys.new_diagnostics_state(now, IFACE1);
601        let view = MockInterfaceView::new(
602            IFACE1,
603            [Route {
604                destination: SUBNET_V6,
605                outbound_interface: IFACE1,
606                next_hop: Some(NEIGH_V6),
607            }],
608            [(NEIGH_V6, NEVER_HEALTHY_NEIGHBOR)],
609        );
610        // Only never healthy neighbor doesn't trigger actions.
611        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);
612
613        // Once we have another eligible unhealthy gateway an action is
614        // triggered.
615        let view = MockInterfaceView::new(
616            IFACE1,
617            [
618                Route {
619                    destination: SUBNET_V4,
620                    outbound_interface: IFACE1,
621                    next_hop: Some(NEIGH_V4),
622                },
623                Route {
624                    destination: SUBNET_V6,
625                    outbound_interface: IFACE1,
626                    next_hop: Some(NEIGH_V6),
627                },
628            ],
629            [(NEIGH_V4, UNHEALTHY_NEIGHBOR), (NEIGH_V6, NEVER_HEALTHY_NEIGHBOR)],
630        );
631        sys.set_counters_return_timeout(IFACE1);
632        assert_eq!(
633            Watchdog::evaluate_interface_state(now, &mut state, view.view()).await,
634            Some(Action {
635                trigger_stack_diagnosis: false,
636                trigger_device_diagnosis: true,
637                reason: ActionReason::CantFetchCounters
638            })
639        );
640    }
641
642    #[fuchsia::test]
643    async fn no_action_if_one_gateway_is_healthy() {
644        let sys = MockSystem::default();
645        let now = SOME_TIME;
646        let mut state = sys.new_diagnostics_state(now, IFACE1);
647        let view = MockInterfaceView::new(
648            IFACE1,
649            [
650                Route {
651                    destination: SUBNET_V4,
652                    outbound_interface: IFACE1,
653                    next_hop: Some(NEIGH_V4),
654                },
655                Route {
656                    destination: SUBNET_V6,
657                    outbound_interface: IFACE1,
658                    next_hop: Some(NEIGH_V6),
659                },
660            ],
661            [(NEIGH_V4, UNHEALTHY_NEIGHBOR), (NEIGH_V6, HEALTHY_NEIGHBOR)],
662        );
663        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);
664    }
665
    // Walks a single interface with an unhealthy gateway through a sequence of
    // counter observations, checking stall detection, the debug info cooldown
    // and the recovery back to healthy.
    //
    // NOTE(review): the comments below assume `increment_counters` adds the
    // given amounts to the mock's counters — confirm against `MockSystem`.
    #[fuchsia::test]
    async fn actions_from_counters() {
        let sys = MockSystem::default();
        let now = SOME_TIME;
        let mut state = sys.new_diagnostics_state(now, IFACE1);
        let view = MockInterfaceView::new(
            IFACE1,
            [Route {
                destination: SUBNET_V4,
                outbound_interface: IFACE1,
                next_hop: Some(NEIGH_V4),
            }],
            [(NEIGH_V4, UNHEALTHY_NEIGHBOR)],
        );
        // Both counters moved since initialization, so no stall is reported.
        let now = now + DEVICE_COUNTERS_UNHEALTHY_TIME;
        sys.increment_counters(IFACE1, DeviceCounters { rx_frames: 10, tx_frames: 10 });
        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);

        // Only tx moves this time; expect an rx stall to be detected.
        let now = now + DEVICE_COUNTERS_UNHEALTHY_TIME;
        sys.increment_counters(IFACE1, DeviceCounters { rx_frames: 0, tx_frames: 10 });
        assert_eq!(
            Watchdog::evaluate_interface_state(now, &mut state, view.view()).await,
            Some(Action {
                trigger_stack_diagnosis: true,
                trigger_device_diagnosis: true,
                reason: ActionReason::DeviceRxStall
            })
        );
        sys.increment_counters(IFACE1, DeviceCounters { rx_frames: 10, tx_frames: 0 });

        // One second short of the cooldown since the rx stall action.
        let now = now + DEBUG_INFO_COOLDOWN - zx::MonotonicDuration::from_seconds(1);
        // Don't trigger again because of cooldown.
        assert_eq!(Watchdog::evaluate_interface_state(now, &mut state, view.view()).await, None);

        // Now detect a tx stall.
        sys.increment_counters(IFACE1, DeviceCounters { rx_frames: 10, tx_frames: 0 });
        let now = now + zx::MonotonicDuration::from_seconds(1);
        assert_eq!(
            Watchdog::evaluate_interface_state(now, &mut state, view.view()).await,
            Some(Action {
                trigger_stack_diagnosis: true,
                trigger_device_diagnosis: true,
                reason: ActionReason::DeviceTxStall
            })
        );
        // The new action's time is recorded as the last action.
        assert_eq!(state.health, HealthStatus::Unhealthy { last_action: now });

        let later = now + zx::MonotonicDuration::from_seconds(1);

        // If the gateway disappears, no action is taken but we maintain the
        // unhealthy state.
        let view = MockInterfaceView::new(IFACE1, None, [(NEIGH_V4, HEALTHY_NEIGHBOR)]);
        assert_eq!(Watchdog::evaluate_interface_state(later, &mut state, view.view()).await, None);
        assert_eq!(state.health, HealthStatus::Unhealthy { last_action: now });

        // Finally, if the gateway becomes healthy, the system should go back to
        // healthy state.
        let later = later + zx::MonotonicDuration::from_seconds(1);
        let view = MockInterfaceView::new(
            IFACE1,
            [Route {
                destination: SUBNET_V4,
                outbound_interface: IFACE1,
                next_hop: Some(NEIGH_V4),
            }],
            [(NEIGH_V4, HEALTHY_NEIGHBOR)],
        );
        assert_eq!(Watchdog::evaluate_interface_state(later, &mut state, view.view()).await, None);
        // The time of the last action survives the transition to healthy.
        assert_eq!(state.health, HealthStatus::Healthy { last_action: Some(now) });
    }
736
737    #[fuchsia::test]
738    async fn triggers_diagnostics_requests() {
739        let sys = MockSystem::default();
740        sys.insert_interface_diagnostics(IFACE1);
741        let now = SOME_TIME;
742        let view = MockInterfaceView::new(
743            IFACE1,
744            [Route {
745                destination: SUBNET_V4,
746                outbound_interface: IFACE1,
747                next_hop: Some(NEIGH_V4),
748            }],
749            [(NEIGH_V4, UNHEALTHY_NEIGHBOR)],
750        );
751
752        let mut watchdog = Watchdog::new();
753        watchdog.check_interface_state(now, &sys, view.view()).await;
754        assert!(!sys.take_interface_debug_requested(IFACE1));
755        assert!(!sys.take_system_debug_requested());
756
757        let now = now + DEVICE_COUNTERS_UNHEALTHY_TIME;
758        watchdog.check_interface_state(now, &sys, view.view()).await;
759        assert!(sys.take_interface_debug_requested(IFACE1));
760        assert!(sys.take_system_debug_requested());
761
762        // Still unhealthy, but cooling down on debug requests.
763        let now = now + DEBUG_INFO_COOLDOWN / 2;
764        watchdog.check_interface_state(now, &sys, view.view()).await;
765        assert!(!sys.take_interface_debug_requested(IFACE1));
766        assert!(!sys.take_system_debug_requested());
767
768        let now = now + DEBUG_INFO_COOLDOWN;
769        watchdog.check_interface_state(now, &sys, view.view()).await;
770        assert!(sys.take_interface_debug_requested(IFACE1));
771        assert!(sys.take_system_debug_requested());
772    }
773
774    #[fuchsia::test]
775    fn gateway_health() {
776        let now = SOME_TIME;
777
778        // Healthy neighbor is never considered unhealthy.
779        assert_eq!(
780            GatewayHealth::from_neighbor_health(
781                &NeighborHealth::Healthy { last_observed: now },
782                now
783            ),
784            GatewayHealth::Healthy
785        );
786        assert_eq!(
787            GatewayHealth::from_neighbor_health(
788                &NeighborHealth::Healthy { last_observed: now },
789                now + zx::MonotonicDuration::from_minutes(60),
790            ),
791            GatewayHealth::Healthy
792        );
793
794        // Neighbor is unhealthy has never been healthy.
795        assert_eq!(
796            GatewayHealth::from_neighbor_health(
797                &NeighborHealth::Unhealthy { last_healthy: None },
798                now
799            ),
800            GatewayHealth::NeverHealthy
801        );
802
803        // Unhealthy neighbor is only considered unhealthy gateway after some
804        // time.
805        assert_eq!(
806            GatewayHealth::from_neighbor_health(
807                &NeighborHealth::Unhealthy { last_healthy: Some(now) },
808                now
809            ),
810            GatewayHealth::RecentlyUnhealthy
811        );
812        assert_eq!(
813            GatewayHealth::from_neighbor_health(
814                &NeighborHealth::Unhealthy { last_healthy: Some(now) },
815                now + NEIGHBOR_UNHEALTHY_TIME
816            ),
817            GatewayHealth::Unhealthy
818        );
819    }
820
    /// The instant at zero nanoseconds on the monotonic timeline.
    const ZERO_TIME: zx::MonotonicInstant = zx::MonotonicInstant::from_nanos(0);
    /// The instant exactly `NEIGHBOR_UNHEALTHY_TIME` after `ZERO_TIME`.
    const SOME_TIME: zx::MonotonicInstant =
        zx::MonotonicInstant::from_nanos(NEIGHBOR_UNHEALTHY_TIME.into_nanos());
    /// A neighbor in unhealthy state that was last healthy at `ZERO_TIME`.
    ///
    /// When evaluated at `SOME_TIME`, exactly `NEIGHBOR_UNHEALTHY_TIME` has
    /// elapsed since it was last healthy.
    const UNHEALTHY_NEIGHBOR: NeighborState =
        NeighborState::new(NeighborHealth::Unhealthy { last_healthy: Some(ZERO_TIME) });
    /// A neighbor in healthy state that was last observed at `ZERO_TIME`.
    const HEALTHY_NEIGHBOR: NeighborState =
        NeighborState::new(NeighborHealth::Healthy { last_observed: ZERO_TIME });

    /// Identifier of the single interface used by the tests.
    const IFACE1: InterfaceId = 1;
    /// IPv4 neighbor address used by the tests.
    const NEIGH_V4: fnet::IpAddress = fidl_ip!("192.0.2.1");
    /// IPv6 neighbor address used by the tests.
    const NEIGH_V6: fnet::IpAddress = fidl_ip!("2001:db8::1");
    // Arbitrary subnet values with which to create routes.
    const SUBNET_V4: fnet::Subnet = fidl_subnet!("0.0.0.0/0");
    const SUBNET_V6: fnet::Subnet = fidl_subnet!("::0/0");
835
    /// Owns the data that an `InterfaceView` borrows, so that tests can build
    /// views on demand through `MockInterfaceView::view`.
    struct MockInterfaceView {
        /// Interface properties handed out by reference in the view.
        properties: fnet_interfaces_ext::Properties<fnet_interfaces_ext::DefaultInterest>,
        /// Route table handed out by reference in the view.
        routes: RouteTable,
        /// Neighbor cache for the interface; the view always exposes it as
        /// `Some`.
        neighbors: crate::InterfaceNeighborCache,
    }
841
842    impl MockInterfaceView {
843        fn new<
844            R: IntoIterator<Item = Route>,
845            N: IntoIterator<Item = (fnet::IpAddress, NeighborState)>,
846        >(
847            id: InterfaceId,
848            routes: R,
849            neighbors: N,
850        ) -> Self {
851            Self {
852                properties: fnet_interfaces_ext::Properties {
853                    id: id.try_into().expect("should be nonzero"),
854                    name: "foo".to_owned(),
855                    port_class: fnet_interfaces_ext::PortClass::Loopback,
856                    online: true,
857                    addresses: vec![],
858                    has_default_ipv4_route: true,
859                    has_default_ipv6_route: true,
860                },
861                routes: testutil::build_route_table_from_flattened_routes(routes),
862                neighbors: neighbors.into_iter().collect(),
863            }
864        }
865
866        fn view(&self) -> InterfaceView<'_> {
867            let Self { properties, routes, neighbors } = self;
868            InterfaceView { properties, routes: &routes, neighbors: Some(neighbors) }
869        }
870    }
871
    /// Per-interface state tracked by the mocks.
    #[derive(Debug)]
    struct MockCounterState {
        /// The result that `MockDiagnostics::get_counters` returns for the
        /// interface. `None` means no result has been configured, in which
        /// case `get_counters` panics.
        counters_result: Option<Result<DeviceCounters, Error>>,
        /// Set when `MockDiagnostics::log_debug_info` is called for the
        /// interface; read and reset by
        /// `MockSystem::take_interface_debug_requested`.
        debug_requested: bool,
    }
877
    /// Mock state keyed by interface id, shared between `MockSystem` and the
    /// `MockDiagnostics` instances it hands out.
    type MockState = Arc<Mutex<HashMap<InterfaceId, MockCounterState>>>;

    /// The watchdog type under test, specialized to the mock system.
    type Watchdog = super::Watchdog<MockSystem>;
881
    /// Mock `SystemDispatcher` implementation that records debug information
    /// requests and serves configurable per-interface counters.
    #[derive(Default)]
    struct MockSystem {
        /// Per-interface state, shared with every `MockDiagnostics` created by
        /// `get_device_diagnostics`.
        inner: MockState,
        /// Set when `log_debug_info` is called on the system; read and reset
        /// by `take_system_debug_requested`.
        debug_info_requested: std::sync::atomic::AtomicBool,
    }
887
888    #[async_trait::async_trait]
889    impl SystemDispatcher for MockSystem {
890        type DeviceDiagnostics = MockDiagnostics;
891
892        async fn log_debug_info(&self) -> Result<(), Error> {
893            let Self { inner: _, debug_info_requested } = self;
894            debug_info_requested.store(true, std::sync::atomic::Ordering::SeqCst);
895            Ok(())
896        }
897
898        fn get_device_diagnostics(
899            &self,
900            interface: InterfaceId,
901        ) -> Result<Self::DeviceDiagnostics, Error> {
902            let Self { inner, debug_info_requested: _ } = self;
903            Ok(MockDiagnostics { inner: inner.clone(), interface })
904        }
905    }
906
907    impl MockSystem {
908        fn insert_interface_diagnostics(&self, interface: InterfaceId) {
909            let counters = DeviceCounters { rx_frames: 0, tx_frames: 0 };
910            assert_matches!(
911                self.inner.lock().unwrap().insert(
912                    interface,
913                    MockCounterState {
914                        counters_result: Some(Ok(counters)),
915                        debug_requested: false
916                    }
917                ),
918                None
919            );
920        }
921
922        fn new_diagnostics_state(
923            &self,
924            now: zx::MonotonicInstant,
925            interface: InterfaceId,
926        ) -> InterfaceDiagnosticsState<MockDiagnostics> {
927            self.insert_interface_diagnostics(interface);
928            let state = Watchdog::initialize_interface_state(now, self, interface)
929                .now_or_never()
930                .expect("future should be ready")
931                .expect("failed to initialize interface state");
932
933            // Remove the initial counters to force tests that use this function
934            // to explicitly set any counter values they may wish to use.
935            self.inner.lock().unwrap().get_mut(&interface).unwrap().counters_result = None;
936
937            state
938        }
939
940        fn set_counters_return_timeout(&self, interface: InterfaceId) {
941            self.inner.lock().unwrap().get_mut(&interface).unwrap().counters_result =
942                Some(Err(Error::Timeout));
943        }
944
945        fn increment_counters(
946            &self,
947            interface: InterfaceId,
948            DeviceCounters { rx_frames: rx, tx_frames: tx }: DeviceCounters,
949        ) {
950            let mut state = self.inner.lock().unwrap();
951            let MockCounterState { counters_result, debug_requested: _ } =
952                state.get_mut(&interface).unwrap();
953            *counters_result = Some(Ok(match counters_result {
954                Some(Ok(DeviceCounters { rx_frames, tx_frames })) => {
955                    DeviceCounters { rx_frames: *rx_frames + rx, tx_frames: *tx_frames + tx }
956                }
957                None | Some(Err(_)) => DeviceCounters { rx_frames: rx, tx_frames: tx },
958            }));
959        }
960
961        fn take_interface_debug_requested(&self, interface: InterfaceId) -> bool {
962            let mut state = self.inner.lock().unwrap();
963            if let Some(MockCounterState { counters_result: _, debug_requested }) =
964                state.get_mut(&interface)
965            {
966                std::mem::replace(debug_requested, false)
967            } else {
968                false
969            }
970        }
971
972        fn take_system_debug_requested(&self) -> bool {
973            self.debug_info_requested.swap(false, std::sync::atomic::Ordering::SeqCst)
974        }
975    }
976
    /// Mock `DeviceDiagnosticsProvider` for a single interface, backed by
    /// shared mock state.
    #[derive(Debug)]
    struct MockDiagnostics {
        /// Handle to the shared per-interface mock state.
        inner: MockState,
        /// The interface whose entry in `inner` this provider reads and
        /// updates.
        interface: InterfaceId,
    }
982
983    #[async_trait::async_trait]
984    impl DeviceDiagnosticsProvider for MockDiagnostics {
985        async fn get_counters(&self) -> Result<DeviceCounters, Error> {
986            let Self { inner, interface } = self;
987            let state = inner.lock().unwrap();
988            state.get(interface).ok_or_else(|| Error::Fidl(fidl::Error::Invalid)).and_then(
989                |MockCounterState { counters_result, debug_requested: _ }| {
990                    counters_result.clone().expect("called get_counters on uninitialized mock")
991                },
992            )
993        }
994
995        async fn log_debug_info(&self) -> Result<(), Error> {
996            let Self { inner, interface } = self;
997            let mut state = inner.lock().unwrap();
998            let MockCounterState { counters_result: _, debug_requested } =
999                state.get_mut(interface).unwrap();
1000            *debug_requested = true;
1001            Ok(())
1002        }
1003    }
1004}