diff --git a/crates/scout/src/deprovision/scrabbing.rs b/crates/scout/src/deprovision/scrabbing.rs index c9a482dc45..e38261a10d 100644 --- a/crates/scout/src/deprovision/scrabbing.rs +++ b/crates/scout/src/deprovision/scrabbing.rs @@ -23,13 +23,12 @@ use carbide_uuid::machine::MachineId; use regex::Regex; use scout::CarbideClientError; use serde::Deserialize; -use smbioslib::SMBiosSystemInformation; use tracing::Instrument; use crate::cfg::Options; use crate::client::create_forge_client; use crate::deprovision::cmdrun; -use crate::{CarbideClientResult, IN_QEMU_VM}; +use crate::{CarbideClientResult, IN_QEMU_VM, platform}; fn check_memory_overwrite_efi_var() -> Result<(), CarbideClientError> { let name = match efivar::efi::Variable::from_str( @@ -1093,22 +1092,9 @@ async fn do_cleanup(machine_id: &MachineId) -> CarbideClientResult bool { - match smbioslib::table_load_from_device() { - Ok(data) => data.any(|sys_info: SMBiosSystemInformation| { - !sys_info - .product_name() - .to_string() - .to_lowercase() - .contains("bluefield") - }), - Err(_err) => true, - } -} - pub(crate) async fn run(config: &Options, machine_id: &MachineId) -> CarbideClientResult<()> { tracing::info!("full deprovision starts."); - if !is_host() { + if !platform::is_host() { tracing::info!("full deprovision skipped, we are not running on a host."); // do not send API cleanup_machine_completed return Ok(()); @@ -1122,7 +1108,7 @@ pub(crate) async fn run(config: &Options, machine_id: &MachineId) -> CarbideClie } pub async fn run_no_api(tpm_path: &str) -> Result<(), CarbideClientError> { - if !is_host() { + if !platform::is_host() { tracing::info!("No cleanup needed on DPU."); return Ok(()); } diff --git a/crates/scout/src/main.rs b/crates/scout/src/main.rs index 695b2563d5..10c334e89e 100644 --- a/crates/scout/src/main.rs +++ b/crates/scout/src/main.rs @@ -55,6 +55,7 @@ mod discovery; mod firmware_upgrade; mod machine_validation; mod mlx_device; +mod platform; mod register; mod stream; mod tpm; 
diff --git a/crates/scout/src/platform.rs b/crates/scout/src/platform.rs
new file mode 100644
index 0000000000..8af08c58b2
--- /dev/null
+++ b/crates/scout/src/platform.rs
@@ -0,0 +1,73 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+use smbioslib::{SMBiosSystemInformation, table_load_from_device};
+
+/// Classifies a machine as host or DPU from its SMBIOS product names.
+///
+/// Returns `false` as soon as any name contains `"bluefield"` (compared
+/// ASCII-case-insensitively) and `true` otherwise, including for an empty input.
+pub(crate) fn is_host_from_product_names<'a>(
+    product_names: impl IntoIterator<Item = &'a str>,
+) -> bool {
+    !product_names
+        .into_iter()
+        .any(|name| name.to_ascii_lowercase().contains("bluefield"))
+}
+
+/// Returns `true` when scout is running on a managed host (as opposed to a DPU).
+pub(crate) fn is_host() -> bool {
+    match table_load_from_device() {
+        Ok(data) => {
+            let product_names = data
+                .map(|sys_info: SMBiosSystemInformation| sys_info.product_name().to_string())
+                .collect::<Vec<_>>(); // owned Strings back the &str views passed below
+            is_host_from_product_names(product_names.iter().map(String::as_str))
+        }
+        Err(_err) => true, // SMBIOS table could not be loaded: report "host"
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn is_host_from_product_names_cases() {
+        let cases: &[(&[&str], bool)] = &[
+            (&["DGX H100"], true),
+            (&["BlueField-3 DPU"], false),
+            (&["NVIDIA Bluefield 2"], false),
+            (&[""], true),
+            (&["DGX H100", "BlueField-3 DPU"], false),
+            (&["DGX H100", "Other Platform"], true),
+        ];
+
+        for (product_names, want_host) in cases {
+            assert_eq!(
+                is_host_from_product_names(product_names.iter().copied()),
+                *want_host,
+                "product_names={product_names:?}"
+            );
+        }
+    }
+
+    #[test]
+    fn is_host_returns_bool_without_panicking() {
+        let _ = is_host(); // smoke test only: the value depends on the machine's SMBIOS data
+    }
+}
diff --git a/crates/scout/src/register.rs b/crates/scout/src/register.rs
index e02f23b91b..7474663244 100644
--- a/crates/scout/src/register.rs
+++ b/crates/scout/src/register.rs
@@ -24,7 +24,7 @@ use tracing::info;
 use tss_esapi::Context;
 use tss_esapi::handles::KeyHandle;
 
-use crate::{CarbideClientError, attestation as attest};
+use crate::{CarbideClientError, attestation as attest, platform, tpm};
 
 pub async fn run(
     forge_api: &str,
@@ -36,7 +36,9 @@ pub async fn run(
     let mut hardware_info = enumerate_hardware()?;
     info!("Successfully enumerated hardware");
 
-    let is_dpu = hardware_info.tpm_ek_certificate.is_none();
+    // Missing TPM EK material must not be treated as DPU detection. DPUs are
+    // identified from platform SMBIOS data, not from TPM availability.
+ let is_dpu = !platform::is_host(); if machine_interface_id.is_none() && !is_dpu { return Err(CarbideClientError::GenericError( @@ -61,9 +63,17 @@ pub async fn run( // CHANGETO - supply context externally hardware_info.tpm_description = attest::get_tpm_description(&mut tss_ctx); - let result = attest::create_attest_key_info(&mut tss_ctx).map_err(|e| { - CarbideClientError::TpmError(format!("Could not create AttestKeyInfo: {e}")) - })?; + let result = match attest::create_attest_key_info(&mut tss_ctx) { + Ok(result) => result, + Err(e) => { + if tpm::should_attempt_tpm_recovery_for_attest_key_failure(&*e) { + tpm::recover_tpm_and_reboot(tpm_path)?; + } + return Err(CarbideClientError::TpmError(format!( + "Could not create AttestKeyInfo: {e}" + ))); + } + }; hardware_info.attest_key_info = Some(result.0); endorsement_key_handle_opt = Some(result.1); diff --git a/crates/scout/src/tpm.rs b/crates/scout/src/tpm.rs index 46ba4c49d1..2d23bcf0db 100644 --- a/crates/scout/src/tpm.rs +++ b/crates/scout/src/tpm.rs @@ -15,6 +15,9 @@ * limitations under the License. */ +use std::fs::{self, OpenOptions}; +use std::io::{ErrorKind, Write}; +use std::path::Path; use std::process::Command; use tss_esapi::handles::AuthHandle; @@ -22,6 +25,8 @@ use tss_esapi::interface_types::session_handles::AuthSession; use crate::{CarbideClientError, attestation as attest}; +pub(crate) const TPM_RECOVERY_ATTEMPTED_PATH: &str = "/run/scout/tpm_recovery_reboot_attempted"; + // From https://superuser.com/questions/1404738/tpm-2-0-hardware-error-da-lockout-mode pub(crate) fn set_tpm_max_auth_fail() -> Result<(), CarbideClientError> { let output = Command::new("tpm2_dictionarylockout") @@ -81,3 +86,82 @@ pub(crate) fn clear_tpm(tpm_path: &str) -> Result<(), CarbideClientError> { tracing::info!("TPM lockout hierarchy clear completed"); Ok(()) } + +/// Returns true when attestation-key setup failed after a TPM context was opened successfully. 
+///
+/// Recovery is only attempted for this stage: context creation failures (bad path, missing device)
+/// are not recoverable via TPM clear.
+///
+/// The check itself is purely textual: it returns `false` only when the error's
+/// ASCII-lowercased `Display` output contains `"not supported"`, and `true` for every
+/// other error.
+pub(crate) fn should_attempt_tpm_recovery_for_attest_key_failure(
+    source: &dyn std::error::Error,
+) -> bool {
+    let message = source.to_string().to_ascii_lowercase();
+    !message.contains("not supported")
+}
+
+/// Records that a TPM recovery attempt is starting by creating the marker file at
+/// [`TPM_RECOVERY_ATTEMPTED_PATH`].
+///
+/// The marker is opened with `create_new`, so the first caller creates it and any later
+/// caller observes `AlreadyExists` for as long as the file is present.
+///
+/// # Errors
+///
+/// * `CarbideClientError::TpmError` if the marker already exists.
+/// * `CarbideClientError::StdIo` if the parent directory or the marker file cannot be
+///   created or written.
+fn claim_tpm_recovery_attempt() -> Result<(), CarbideClientError> {
+    if let Some(parent) = Path::new(TPM_RECOVERY_ATTEMPTED_PATH).parent() {
+        fs::create_dir_all(parent).map_err(CarbideClientError::StdIo)?;
+    }
+
+    // `create_new` makes "check and create" a single step: it fails if the file exists.
+    let mut marker = match OpenOptions::new()
+        .write(true)
+        .create_new(true)
+        .open(TPM_RECOVERY_ATTEMPTED_PATH)
+    {
+        Ok(file) => file,
+        Err(e) if e.kind() == ErrorKind::AlreadyExists => {
+            return Err(CarbideClientError::TpmError(
+                "TPM recovery was already attempted this boot cycle; refusing to loop".to_string(),
+            ));
+        }
+        Err(e) => return Err(CarbideClientError::StdIo(e)),
+    };
+    // The marker's existence is what is tested above; the text is informational.
+    marker
+        .write_all(b"tpm recovery reboot requested\n")
+        .map_err(CarbideClientError::StdIo)
+}
+
+/// Clears the TPM and reboots the host once per boot cycle to recover from missing TPM material.
+///
+/// The attempt is first recorded via `claim_tpm_recovery_attempt`, so a repeated call made
+/// while the marker file exists fails before the TPM is touched. Once `clear_tpm` succeeds,
+/// `systemctl reboot` is invoked.
+///
+/// # Errors
+///
+/// Propagates errors from `claim_tpm_recovery_attempt` and `clear_tpm`, returns
+/// `CarbideClientError::StdIo` if `systemctl` cannot be run, and
+/// `CarbideClientError::GenericError` if it exits unsuccessfully.
+pub(crate) fn recover_tpm_and_reboot(tpm_path: &str) -> Result<(), CarbideClientError> {
+    // Claim before clearing: a failed claim must leave the TPM untouched.
+    claim_tpm_recovery_attempt()?;
+
+    tracing::warn!("Attempting automated TPM clear and reboot to recover attestation state");
+    clear_tpm(tpm_path)?;
+
+    let output = Command::new("systemctl")
+        .arg("reboot")
+        .output()
+        .map_err(CarbideClientError::StdIo)?;
+    if !output.status.success() {
+        return Err(CarbideClientError::GenericError(format!(
+            "systemctl reboot failed with status {:?}: {}",
+            output.status.code(),
+            String::from_utf8_lossy(&output.stderr)
+        )));
+    }
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn attest_key_failure_recovery_classification_cases() {
+        let cases: &[(&str, bool)] = &[
+            ("handle already exists", true),
+            ("tpm corruption detected", true),
+            ("feature not supported on this device", false),
+        ];
+
+        for (message, want_recovery) in cases {
+            // Any `Error` works here: only the `Display` text is inspected.
+            let err: Box<dyn std::error::Error> = Box::new(std::io::Error::other(*message));
+            assert_eq!(
+                should_attempt_tpm_recovery_for_attest_key_failure(&*err),
+                *want_recovery,
+                "message={message:?}"
+            );
+        }
+    }
+}