system_update_committer/
main.rs

1// Copyright 2020 The Fuchsia Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#![allow(clippy::let_unit_value)]
6
7use ::fidl::endpoints::RequestStream as _;
8use anyhow::{anyhow, Context as _, Error};
9use fidl_fuchsia_update_verify::HealthVerificationMarker;
10use fuchsia_component::client::connect_to_protocol;
11use fuchsia_component::server::ServiceFs;
12use fuchsia_inspect::health::Reporter as _;
13use fuchsia_inspect::{self as finspect};
14use futures::channel::oneshot;
15use futures::future::{FutureExt as _, TryFutureExt as _};
16use futures::stream::StreamExt as _;
17use log::{error, info, warn};
18use std::sync::Arc;
19use zx::HandleBased as _;
20use {
21    fidl_fuchsia_component_sandbox as fsandbox, fidl_fuchsia_io as fio,
22    fidl_fuchsia_paver as fpaver, fidl_fuchsia_process_lifecycle as flifecycle,
23    fidl_fuchsia_update as fupdate, fuchsia_async as fasync,
24};
25
26mod fidl;
27mod metadata;
28mod reboot;
29
/// Minimum delay between the start of a fresh run and a reboot triggered by a failed commit.
///
/// The feedback component persists the reboot reason, which it learns by registering a watcher
/// with the power manager. Delaying the reboot slightly gives the feedback component a chance to
/// obtain and persist that reason.
const MINIMUM_REBOOT_WAIT: std::time::Duration = std::time::Duration::from_secs(5);
34
35#[fuchsia::main(logging_tags = ["system-update-committer"])]
36pub fn main() -> Result<(), Error> {
37    info!("starting system-update-committer");
38
39    let mut executor = fasync::LocalExecutor::new();
40    let () = executor.run_singlethreaded(main_async()).map_err(|err| {
41        // Use anyhow to print the error chain.
42        let err = anyhow!(err);
43        error!("error running system-update-committer: {:#}", err);
44        err
45    })?;
46
47    info!("stopping system-update-committer");
48    Ok(())
49}
50
51async fn main_async() -> Result<(), Error> {
52    match fuchsia_runtime::take_startup_handle(fuchsia_runtime::HandleInfo::new(
53        fuchsia_runtime::HandleType::EscrowedDictionary,
54        0,
55    )) {
56        Some(dictionary) => {
57            resume_from_escrow(fsandbox::DictionaryRef { token: dictionary.into() })
58                .await
59                .context("resume_from_idle_stop")
60        }
61        None => fresh_run().await.context("first_run"),
62    }
63}
64
65struct EscrowState {
66    // Keep the internal pair alive so clients can't observe EVENTPAIR_PEER_CLOSED.
67    p_internal: ::fidl::Handle,
68    p_external: ::fidl::Handle,
69    // Inspect fails open so in rare cases it may not be available.
70    frozen_inspect: Option<::fidl::Handle>,
71}
72
73// It should not be possible for this function to be called more than once per boot in prod because:
74//   1. it is only called when the component is not supplied with escrowed state
75//   2. CM guarantees that escrowed state will always be supplied when restarting a component
76//   3. the component always* escrows state when exiting cleanly
77//   4. the component is `on_terminate: "reboot"` and so unclean exits should trigger reboot
78//
79//   * If the component receives a fuchsia.process.lifecycle/LifeCycle.Stop message it will exit
80//     cleanly without escrowing state (see https://fxbug.dev/332341289), but because the component
81//     is not a dynamic component this should only be possible by manually destroying it (e.g. via
82//     `ffx component stop ...`), which never occurs in prod [0].
83//
84// Regardless, it should be safe to call this function multiple times per boot because if the
85// component has exited cleanly then commit must not be necessary (either it was already unnecessary
86// when it was called the first time or the first call successfully committed) and calling this
87// function when commit is not necessary is safe.
88//
89// Returns success if the component is idle or asked to stop.
90//
91// [0] https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/0110_reboot_for_critical_components?hl=en#detecting_termination_of_reboot-on-terminate_components
92async fn fresh_run() -> Result<(), Error> {
93    let reboot_deadline = std::time::Instant::now() + MINIMUM_REBOOT_WAIT;
94
95    let inspector = finspect::Inspector::default();
96    let inspect_controller =
97        inspect_runtime::publish(&inspector, inspect_runtime::PublishOptions::default());
98
99    let config = system_update_committer_config::Config::take_from_startup_handle();
100    let idle_timeout = if config.stop_on_idle_timeout_millis >= 0 {
101        zx::MonotonicDuration::from_millis(config.stop_on_idle_timeout_millis)
102    } else {
103        zx::MonotonicDuration::INFINITE
104    };
105    let commit_timeout = if config.commit_timeout_seconds >= 0 {
106        zx::MonotonicDuration::from_seconds(config.commit_timeout_seconds)
107    } else {
108        zx::MonotonicDuration::INFINITE
109    };
110    inspector
111        .root()
112        .record_child("structured_config", |config_node| config.record_inspect(config_node));
113
114    let verification_node = inspector.root().create_child("verification");
115    let commit_node = metadata::CommitInspect::new(inspector.root().create_child("commit"));
116    let mut health_node = finspect::health::Node::new(inspector.root());
117    let verification_node_ref = &verification_node;
118    let commit_node_ref = &commit_node;
119    let health_node_ref = &mut health_node;
120
121    let paver =
122        connect_to_protocol::<fpaver::PaverMarker>().context("while connecting to paver")?;
123    let (boot_manager, boot_manager_server_end) = ::fidl::endpoints::create_proxy();
124    paver
125        .find_boot_manager(boot_manager_server_end)
126        .context("transport error while calling find_boot_manager()")?;
127    let reboot_proxy =
128        connect_to_protocol::<fidl_fuchsia_hardware_power_statecontrol::AdminMarker>()
129            .context("while connecting to power state control")?;
130
131    let health_verification = connect_to_protocol::<HealthVerificationMarker>()
132        .context("while connecting to health verification")?;
133
134    let (p_internal, p_external) = zx::EventPair::create();
135    let p_internal_clone =
136        p_internal.duplicate_handle(zx::Rights::BASIC).context("while duplicating p_internal")?;
137
138    let (unblocker, blocker) = oneshot::channel();
139
140    // Handle putting boot metadata in happy state, rebooting on failure (if necessary), and
141    // reporting health to the inspect health node.
142    let commit_fut = async move {
143        match metadata::put_metadata_in_happy_state(
144            &boot_manager,
145            &p_internal,
146            unblocker,
147            &health_verification,
148            commit_timeout,
149            verification_node_ref,
150            commit_node_ref,
151        )
152        .await
153        {
154            Err(e) => {
155                let msg = format!(
156                    "Failed to commit system. Rebooting at {:?} given error {:#} and {:?}",
157                    reboot_deadline,
158                    anyhow!(e),
159                    config
160                );
161                health_node_ref.set_unhealthy(&msg);
162                warn!("{msg}");
163                reboot::wait_and_reboot(fasync::Timer::new(reboot_deadline), &reboot_proxy).await;
164            }
165            Ok(commit_result) => {
166                info!("{}", commit_result.log_msg());
167                health_node_ref.set_ok();
168            }
169        }
170    }
171    .fuse();
172    let mut commit_fut = std::pin::pin!(commit_fut);
173
174    let p_external_clone =
175        p_external.duplicate_handle(zx::Rights::BASIC).context("while duplicating p_external")?;
176    let fidl_server = Arc::new(fidl::FuchsiaUpdateFidlServer::new(
177        p_external_clone,
178        blocker.map_err(|e| e.to_string()),
179        idle_timeout,
180    ));
181    let mut fs = ServiceFs::new_local();
182    fs.take_and_serve_directory_handle()
183        .context("while taking directory handle")?
184        .dir("svc")
185        .add_fidl_service(Services::CommitStatusProvider);
186    let fs = fs.until_stalled(idle_timeout);
187    let active_guard = fs.active_guard();
188
189    let mut service_fut = async move {
190        let out_dir = fuchsia_sync::Mutex::new(None);
191        let () = fs
192            .for_each_concurrent(None, |item| async {
193                use fuchsia_component::server::Item;
194                match item {
195                    Item::Request(Services::CommitStatusProvider(stream), _active_guard) => {
196                        let () = fidl_server
197                            .clone()
198                            .handle_commit_status_provider_stream(stream)
199                            .await
200                            .unwrap_or_else(|e| {
201                                warn!("handling CommitStatusProviderStream {e:#}");
202                            });
203                    }
204                    Item::Stalled(outgoing_dir) => {
205                        *out_dir.lock() = Some(outgoing_dir);
206                    }
207                }
208            })
209            .await;
210        let out_dir = out_dir
211            .lock()
212            .take()
213            .expect("StallableServiceFs should return the out dir before ending");
214        Ok({
215            let frozen_inspect = if let Some(inspect_controller) = inspect_controller {
216                Some(
217                    inspect_controller
218                        .escrow_frozen(inspect_runtime::EscrowOptions::default())
219                        .await
220                        .context("freezing inspect")?
221                        .token
222                        .into_handle(),
223                )
224            } else {
225                None
226            };
227            (
228                EscrowState {
229                    p_internal: p_internal_clone.into(),
230                    p_external: p_external.into(),
231                    frozen_inspect,
232                },
233                out_dir.into(),
234            )
235        })
236    }
237    .boxed_local()
238    .fuse();
239    let service_fut = futures::select! {
240        () = commit_fut => {
241            // Keep serving the out dir until the device is committed to avoid the confusing
242            // situation in which the component is running but new clients are being ignored.
243            drop(active_guard);
244            service_fut
245        },
246        _ = service_fut => {
247            panic!("fidl service fut completed before commit fut. this should be impossible \
248            because of the active guard");
249        }
250    };
251
252    run_until_idle_or_component_stopped(service_fut).await
253}
254
255// Returns success if the component is idle or asked to stop.
256async fn resume_from_escrow(escrowed_state: fsandbox::DictionaryRef) -> Result<(), Error> {
257    let EscrowState { p_internal, p_external, frozen_inspect } =
258        EscrowState::load(escrowed_state).await.context("loading escrowed state")?;
259
260    let config = system_update_committer_config::Config::take_from_startup_handle();
261    let idle_timeout = if config.stop_on_idle_timeout_millis >= 0 {
262        zx::MonotonicDuration::from_millis(config.stop_on_idle_timeout_millis)
263    } else {
264        zx::MonotonicDuration::INFINITE
265    };
266
267    let p_external_clone =
268        p_external.duplicate_handle(zx::Rights::BASIC).context("while duplicating p_external")?;
269    let fidl_server = Arc::new(fidl::FuchsiaUpdateFidlServer::new(
270        p_external_clone.into(),
271        futures::future::ready(Ok(())),
272        idle_timeout,
273    ));
274    let mut fs = ServiceFs::new_local();
275    fs.take_and_serve_directory_handle()
276        .context("while taking directory handle")?
277        .dir("svc")
278        .add_fidl_service(Services::CommitStatusProvider);
279    let fs = fs.until_stalled(idle_timeout);
280
281    let service_fut = async move {
282        let out_dir = fuchsia_sync::Mutex::new(None);
283        let () = fs
284            .for_each_concurrent(None, |item| async {
285                use fuchsia_component::server::Item;
286                match item {
287                    Item::Request(Services::CommitStatusProvider(stream), _active_guard) => {
288                        let () = fidl_server
289                            .clone()
290                            .handle_commit_status_provider_stream(stream)
291                            .await
292                            .unwrap_or_else(|e| {
293                                warn!("handling CommitStatusProviderStream {e:#}");
294                            });
295                    }
296                    Item::Stalled(outgoing_dir) => {
297                        *out_dir.lock() = Some(outgoing_dir);
298                    }
299                }
300            })
301            .await;
302        let out_dir = out_dir
303            .lock()
304            .take()
305            .expect("StallableServiceFs should return the out dir before ending");
306        Ok((EscrowState { p_internal, p_external, frozen_inspect }, out_dir.into()))
307    }
308    .boxed_local()
309    .fuse();
310    run_until_idle_or_component_stopped(service_fut).await
311}
312
313// Runs service_fut until it completes or a Lifecycle.Stop message is received, then escrows state
314// if possible.
315#[allow(clippy::type_complexity)]
316async fn run_until_idle_or_component_stopped(
317    mut service_fut: futures::future::Fuse<
318        futures::future::LocalBoxFuture<
319            '_,
320            Result<(EscrowState, ::fidl::endpoints::ServerEnd<fio::DirectoryMarker>), Error>,
321        >,
322    >,
323) -> Result<(), Error> {
324    // We have ignored fuchsia.process/Lifecycle.Stop messages until now so that `fresh_run` can try
325    // to commit the system without interruption.
326    // Components have 5 seconds [0] to respond to Stop messages before being terminated by CM. We
327    // use the entire period to maximize the chance we commit the system if it is pending and passes
328    // the checks to avoid unnecessarily spending one of the seven boot attempts before rollback.
329    // [0] https://cs.opensource.google/fuchsia/fuchsia/+/main:src/sys/component_manager/src/model/environment.rs;l=31;drc=2f83da829133fd5432e7d9a3aeb4f46750f8572e
330    let lifecycle = fuchsia_runtime::take_startup_handle(fuchsia_runtime::HandleInfo::new(
331        fuchsia_runtime::HandleType::Lifecycle,
332        0,
333    ))
334    .context("taking lifecycle handle")?;
335    let lifecycle: ::fidl::endpoints::ServerEnd<flifecycle::LifecycleMarker> = lifecycle.into();
336    let (mut lifecycle_request_stream, lifecycle_controller) =
337        lifecycle.into_stream_and_control_handle();
338
339    futures::select! {
340        res = service_fut => {
341            let (state, out_dir) = res?;
342            let () = lifecycle_controller
343                .send_on_escrow(flifecycle::LifecycleOnEscrowRequest {
344                    outgoing_dir: Some(out_dir),
345                    escrowed_dictionary: Some(state.store().await.context("escrowing state")?),
346                    ..Default::default()
347                })
348                .context("escrowing component")?;
349            Ok(())
350        },
351        req = lifecycle_request_stream.next() => {
352            match req.ok_or_else(|| anyhow::anyhow!("LifecycleRequest stream closed unexpectedly"))?
353            .context("error reading from LifecycleRequest stream")?
354                {
355                flifecycle::LifecycleRequest::Stop{ control_handle} => {
356                    // TODO(https://fxbug.dev/332341289) Exit cleanly by escrowing state including
357                    // any client connections.
358                    info!(
359                        "received flifecycle::LifecycleRequest::Stop. Any client connections will \
360                         be closed. This should only happen during shutdown."
361                    );
362                    // The shutdown request is acknowledged by closing the lifecycle channel which
363                    // causes CM to kill the component.
364                    // Intentionally leak the channel so that it will be closed when the OS cleans
365                    // up the process, allowing the rest of the component's own cleanup to occur.
366
367                    // Drop these so the Arc<Channel> in the request stream can be unwrapped.
368                    drop((control_handle, lifecycle_controller));
369                    // Leak the internal channel instead of the RequestStream because the Fuchsia
370                    // executor will panic if it is dropped while registered receivers are still
371                    // alive.
372                    let (inner, _terminated): (_, bool) = lifecycle_request_stream.into_inner();
373                    let inner = std::sync::Arc::try_unwrap(inner).map_err(
374                        |_: std::sync::Arc<_>| {
375                            anyhow::anyhow!("failed to extract lifecycle channel from Arc")
376                        },
377                    )?;
378                    let inner: zx::Channel = inner.into_channel().into_zx_channel();
379                    std::mem::forget(inner);
380                    Ok(())
381                }
382            }
383        }
384    }
385}
386
387impl EscrowState {
388    const INTERNAL_EVENTPAIR: &'static str = "p_internal";
389    const EXTERNAL_EVENTPAIR: &'static str = "p_external";
390    const INSPECT: &'static str = "frozen_inspect";
391
392    async fn load(dict: fsandbox::DictionaryRef) -> Result<Self, Error> {
393        let store =
394            fuchsia_component::client::connect_to_protocol::<fsandbox::CapabilityStoreMarker>()?;
395        let id_generator = sandbox::CapabilityIdGenerator::new();
396
397        let dict_id = id_generator.next();
398        let () = store
399            .import(dict_id, fsandbox::Capability::Dictionary(dict))
400            .await?
401            .map_err(|e| anyhow!("{e:?}"))?;
402
403        let remove_from_dict = |key: &'static str| async {
404            let id = id_generator.next();
405            match store
406                .dictionary_remove(dict_id, key, Some(&fsandbox::WrappedNewCapabilityId { id }))
407                .await?
408            {
409                Ok(()) => {
410                    let fsandbox::Capability::Handle(handle) =
411                        store.export(id).await?.map_err(|e| anyhow!("{e:?}"))?
412                    else {
413                        anyhow::bail!("Bad capability type from dictionary");
414                    };
415                    Ok(Some(handle))
416                }
417                Err(fsandbox::CapabilityStoreError::ItemNotFound) => Ok(None),
418                Err(e) => {
419                    anyhow::bail!("exporting frozen inspect {e:?}");
420                }
421            }
422        };
423
424        let p_internal = remove_from_dict(Self::INTERNAL_EVENTPAIR)
425            .await?
426            .ok_or_else(|| anyhow!("internal eventpair missing from escrow"))?;
427        let p_external = remove_from_dict(Self::EXTERNAL_EVENTPAIR)
428            .await?
429            .ok_or_else(|| anyhow!("external eventpair missing from escrow"))?;
430        let frozen_inspect = remove_from_dict(Self::INSPECT).await?;
431
432        // TODO(https://fxbug.dev/383161492) Using the unescrowed inspect token, unescrow and
433        // republish the inspect data itself (using fuchsia.inspect/InspectSink.FetchEscrow) so that
434        // the component appears to be running if an inspect snapshot is taken (i.e. so that
435        // escrowed=false) in the inspect snapshot).
436        Ok(Self { p_internal, p_external, frozen_inspect })
437    }
438
439    async fn store(self) -> Result<fsandbox::DictionaryRef, Error> {
440        let Self { p_internal, p_external, frozen_inspect } = self;
441        let store =
442            fuchsia_component::client::connect_to_protocol::<fsandbox::CapabilityStoreMarker>()?;
443        let id_generator = sandbox::CapabilityIdGenerator::new();
444        let dict_id = id_generator.next();
445        let () = store.dictionary_create(dict_id).await?.map_err(|e| anyhow!("{e:?}"))?;
446
447        let insert_in_dict = |handle, key| async {
448            let id = id_generator.next();
449            let () = store
450                .import(id, fsandbox::Capability::Handle(handle))
451                .await?
452                .map_err(|e| anyhow!("{e:?}"))?;
453            let () = store
454                .dictionary_insert(dict_id, &fsandbox::DictionaryItem { key, value: id })
455                .await?
456                .map_err(|e| anyhow!("{e:?}"))?;
457            Result::<_, anyhow::Error>::Ok(())
458        };
459
460        if let Some(frozen_inspect) = frozen_inspect {
461            let () = insert_in_dict(frozen_inspect, Self::INSPECT.into()).await?;
462        }
463        let () = insert_in_dict(p_internal, Self::INTERNAL_EVENTPAIR.into()).await?;
464        let () = insert_in_dict(p_external, Self::EXTERNAL_EVENTPAIR.into()).await?;
465
466        let fsandbox::Capability::Dictionary(dictionary_ref) =
467            store.export(dict_id).await?.map_err(|e| anyhow!("{e:?}"))?
468        else {
469            anyhow::bail!("Bad capability type from dictionary");
470        };
471        Ok(dictionary_ref)
472    }
473}
474
475enum Services {
476    CommitStatusProvider(fupdate::CommitStatusProviderRequestStream),
477}
478
#[cfg(test)]
mod tests {
    use super::*;

    /// Stores an `EscrowState` built around a new eventpair and the given inspect token, then
    /// loads it back from the resulting dictionary.
    async fn round_trip(frozen_inspect: Option<::fidl::Handle>) -> EscrowState {
        let (internal, external) = zx::EventPair::create();
        let original = EscrowState {
            p_internal: internal.into(),
            p_external: external.into(),
            frozen_inspect,
        };

        let dictionary = original.store().await.unwrap();
        EscrowState::load(dictionary).await.unwrap()
    }

    /// A frozen inspect token that was stored is present after loading.
    #[fasync::run_singlethreaded(test)]
    async fn escrow_state_round_trip() {
        let restored = round_trip(Some(zx::Event::create().into())).await;
        assert!(restored.frozen_inspect.is_some());
    }

    /// Storing without a frozen inspect token loads successfully with no token.
    #[fasync::run_singlethreaded(test)]
    async fn escrow_state_round_trip_missing_inspect() {
        let restored = round_trip(None).await;
        assert_eq!(restored.frozen_inspect, None);
    }
}