Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 3 additions & 17 deletions crates/scout/src/deprovision/scrabbing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,12 @@ use carbide_uuid::machine::MachineId;
use regex::Regex;
use scout::CarbideClientError;
use serde::Deserialize;
use smbioslib::SMBiosSystemInformation;
use tracing::Instrument;

use crate::cfg::Options;
use crate::client::create_forge_client;
use crate::deprovision::cmdrun;
use crate::{CarbideClientResult, IN_QEMU_VM};
use crate::{CarbideClientResult, IN_QEMU_VM, platform};

fn check_memory_overwrite_efi_var() -> Result<(), CarbideClientError> {
let name = match efivar::efi::Variable::from_str(
Expand Down Expand Up @@ -1093,22 +1092,9 @@ async fn do_cleanup(machine_id: &MachineId) -> CarbideClientResult<rpc::MachineC
Ok(cleanup_result)
}

/// Reports whether this machine should be treated as a host rather than a DPU.
///
/// Loads the SMBIOS table from the device and returns `true` if any
/// system-information entry has a product name that, once lowercased, does
/// not contain `"bluefield"`. If the SMBIOS table cannot be loaded the
/// function also returns `true`.
fn is_host() -> bool {
    let Ok(smbios) = smbioslib::table_load_from_device() else {
        // No SMBIOS data to inspect: fall back to treating the machine as a host.
        return true;
    };

    smbios.any(|system: SMBiosSystemInformation| {
        let product = system.product_name().to_string().to_lowercase();
        !product.contains("bluefield")
    })
}

pub(crate) async fn run(config: &Options, machine_id: &MachineId) -> CarbideClientResult<()> {
tracing::info!("full deprovision starts.");
if !is_host() {
if !platform::is_host() {
tracing::info!("full deprovision skipped, we are not running on a host.");
// do not send API cleanup_machine_completed
return Ok(());
Expand All @@ -1122,7 +1108,7 @@ pub(crate) async fn run(config: &Options, machine_id: &MachineId) -> CarbideClie
}

pub async fn run_no_api(tpm_path: &str) -> Result<(), CarbideClientError> {
if !is_host() {
if !platform::is_host() {
tracing::info!("No cleanup needed on DPU.");
return Ok(());
}
Expand Down
1 change: 1 addition & 0 deletions crates/scout/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ mod discovery;
mod firmware_upgrade;
mod machine_validation;
mod mlx_device;
mod platform;
mod register;
mod stream;
mod tpm;
Expand Down
69 changes: 69 additions & 0 deletions crates/scout/src/platform.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

use smbioslib::{SMBiosSystemInformation, table_load_from_device};

/// Classifies a machine as host or DPU from its SMBIOS product names.
///
/// Returns `false` as soon as one of `product_names` contains `"bluefield"`
/// (compared ASCII case-insensitively), and `true` otherwise. An empty input
/// therefore yields `true`.
pub(crate) fn is_host_from_product_names<'a>(
    product_names: impl IntoIterator<Item = &'a str>,
) -> bool {
    const DPU_MARKER: &str = "bluefield";

    for name in product_names {
        let lowered = name.to_ascii_lowercase();
        if lowered.contains(DPU_MARKER) {
            // A single BlueField entry is enough to rule out "host".
            return false;
        }
    }
    true
}

/// Returns `true` when scout is running on a managed host (as opposed to a DPU).
///
/// The product name of every SMBIOS system-information entry is collected and
/// handed to [`is_host_from_product_names`]. If the SMBIOS table cannot be
/// loaded from the device, the function returns `true`.
pub(crate) fn is_host() -> bool {
    let Ok(smbios) = table_load_from_device() else {
        // SMBIOS unavailable: default to "host".
        return true;
    };

    // Own the names first so they can be borrowed as `&str` for the classifier.
    let names: Vec<String> = smbios
        .map(|system: SMBiosSystemInformation| system.product_name().to_string())
        .collect();

    is_host_from_product_names(names.iter().map(|name| name.as_str()))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of the product-name classifier.
    #[test]
    fn is_host_from_product_names_cases() {
        // (SMBIOS product names, expected "is host" verdict)
        let table: [(&[&str], bool); 6] = [
            (&["DGX H100"], true),
            (&["BlueField-3 DPU"], false),
            (&["NVIDIA Bluefield 2"], false),
            (&[""], true),
            (&["DGX H100", "BlueField-3 DPU"], false),
            (&["DGX H100", "Other Platform"], true),
        ];

        table.iter().for_each(|&(product_names, want_host)| {
            let got_host = is_host_from_product_names(product_names.iter().copied());
            assert_eq!(got_host, want_host, "product_names={product_names:?}");
        });
    }

    /// Smoke test: whatever the SMBIOS state of the test machine, the call
    /// must complete and yield a `bool`.
    #[test]
    fn is_host_returns_bool_without_panicking() {
        let _verdict: bool = is_host();
    }
}
20 changes: 15 additions & 5 deletions crates/scout/src/register.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use tracing::info;
use tss_esapi::Context;
use tss_esapi::handles::KeyHandle;

use crate::{CarbideClientError, attestation as attest};
use crate::{CarbideClientError, attestation as attest, platform, tpm};

pub async fn run(
forge_api: &str,
Expand All @@ -36,7 +36,9 @@ pub async fn run(
let mut hardware_info = enumerate_hardware()?;
info!("Successfully enumerated hardware");

let is_dpu = hardware_info.tpm_ek_certificate.is_none();
// Missing TPM EK material must not be treated as DPU detection. DPUs are
// identified from platform SMBIOS data, not from TPM availability.
let is_dpu = !platform::is_host();

if machine_interface_id.is_none() && !is_dpu {
return Err(CarbideClientError::GenericError(
Expand All @@ -61,9 +63,17 @@ pub async fn run(
// CHANGETO - supply context externally
hardware_info.tpm_description = attest::get_tpm_description(&mut tss_ctx);

let result = attest::create_attest_key_info(&mut tss_ctx).map_err(|e| {
CarbideClientError::TpmError(format!("Could not create AttestKeyInfo: {e}"))
})?;
let result = match attest::create_attest_key_info(&mut tss_ctx) {
Ok(result) => result,
Err(e) => {
if tpm::should_attempt_tpm_recovery_for_attest_key_failure(&*e) {
tpm::recover_tpm_and_reboot(tpm_path)?;
}
return Err(CarbideClientError::TpmError(format!(
"Could not create AttestKeyInfo: {e}"
)));
}
};

hardware_info.attest_key_info = Some(result.0);
endorsement_key_handle_opt = Some(result.1);
Expand Down
84 changes: 84 additions & 0 deletions crates/scout/src/tpm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,18 @@
* limitations under the License.
*/

use std::fs::{self, OpenOptions};
use std::io::{ErrorKind, Write};
use std::path::Path;
use std::process::Command;

use tss_esapi::handles::AuthHandle;
use tss_esapi::interface_types::session_handles::AuthSession;

use crate::{CarbideClientError, attestation as attest};

pub(crate) const TPM_RECOVERY_ATTEMPTED_PATH: &str = "/run/scout/tpm_recovery_reboot_attempted";

// From https://superuser.com/questions/1404738/tpm-2-0-hardware-error-da-lockout-mode
pub(crate) fn set_tpm_max_auth_fail() -> Result<(), CarbideClientError> {
let output = Command::new("tpm2_dictionarylockout")
Expand Down Expand Up @@ -81,3 +86,82 @@ pub(crate) fn clear_tpm(tpm_path: &str) -> Result<(), CarbideClientError> {
tracing::info!("TPM lockout hierarchy clear completed");
Ok(())
}

/// Decides whether an attestation-key setup failure is worth a TPM recovery attempt.
///
/// The error is rendered with its `Display` implementation and lowercased
/// (ASCII only). Recovery is declined (`false`) when that text contains
/// `"not supported"`; every other error yields `true`.
///
/// Callers are expected to consult this only for failures that occur after a
/// TPM context was opened: context creation failures (bad path, missing
/// device) are not recoverable via TPM clear.
pub(crate) fn should_attempt_tpm_recovery_for_attest_key_failure(
    source: &dyn std::error::Error,
) -> bool {
    let rendered = source.to_string();
    let is_unsupported = rendered.to_ascii_lowercase().contains("not supported");
    !is_unsupported
}

/// Atomically claims the single TPM recovery attempt by creating a marker file.
///
/// The marker at `TPM_RECOVERY_ATTEMPTED_PATH` is opened with `create_new`,
/// so only the first caller succeeds; its parent directory is created first
/// if needed.
///
/// # Errors
///
/// * `CarbideClientError::TpmError` if the marker already exists, i.e. a
///   recovery attempt has already been claimed.
/// * `CarbideClientError::StdIo` for any other failure while creating the
///   directory, creating the marker, or writing to it.
///
/// NOTE(review): the marker path is under `/run`, which is typically cleared
/// on reboot, so this guard presumably only holds within one boot. Confirm
/// that a clear-and-reboot cycle cannot repeat across boots.
fn claim_tpm_recovery_attempt() -> Result<(), CarbideClientError> {
    let marker_path = Path::new(TPM_RECOVERY_ATTEMPTED_PATH);

    if let Some(marker_dir) = marker_path.parent() {
        fs::create_dir_all(marker_dir).map_err(CarbideClientError::StdIo)?;
    }

    // `create_new` fails with `AlreadyExists` if the marker is present, which
    // is what makes the claim exclusive.
    let mut marker_file = OpenOptions::new()
        .write(true)
        .create_new(true)
        .open(marker_path)
        .map_err(|e| {
            if e.kind() == ErrorKind::AlreadyExists {
                CarbideClientError::TpmError(
                    "TPM recovery was already attempted this boot cycle; refusing to loop"
                        .to_string(),
                )
            } else {
                CarbideClientError::StdIo(e)
            }
        })?;

    marker_file
        .write_all(b"tpm recovery reboot requested\n")
        .map_err(CarbideClientError::StdIo)
}

/// Clears the TPM and requests a reboot to recover from missing TPM material.
///
/// Steps, in order:
/// 1. Claim the recovery marker via `claim_tpm_recovery_attempt`; if the claim
///    fails (for example because it was already taken) nothing else is done.
/// 2. Clear the TPM at `tpm_path` via `clear_tpm`.
/// 3. Run `systemctl reboot`.
///
/// # Errors
///
/// Propagates errors from the marker claim and from `clear_tpm`. Returns
/// `CarbideClientError::StdIo` if `systemctl` cannot be spawned and
/// `CarbideClientError::GenericError` if it exits unsuccessfully.
pub(crate) fn recover_tpm_and_reboot(tpm_path: &str) -> Result<(), CarbideClientError> {
    claim_tpm_recovery_attempt()?;

    tracing::warn!("Attempting automated TPM clear and reboot to recover attestation state");
    clear_tpm(tpm_path)?;

    let reboot = Command::new("systemctl")
        .arg("reboot")
        .output()
        .map_err(CarbideClientError::StdIo)?;

    if reboot.status.success() {
        return Ok(());
    }

    let exit_code = reboot.status.code();
    let stderr_text = String::from_utf8_lossy(&reboot.stderr);
    Err(CarbideClientError::GenericError(format!(
        "systemctl reboot failed with status {:?}: {}",
        exit_code, stderr_text
    )))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of which attestation-key errors trigger TPM recovery.
    #[test]
    fn attest_key_failure_recovery_classification_cases() {
        // (error message, expected "attempt recovery" verdict)
        let table: [(&str, bool); 3] = [
            ("handle already exists", true),
            ("tpm corruption detected", true),
            ("feature not supported on this device", false),
        ];

        for &(message, want_recovery) in &table {
            let err: Box<dyn std::error::Error> = Box::new(std::io::Error::other(message));
            let got_recovery = should_attempt_tpm_recovery_for_attest_key_failure(&*err);
            assert_eq!(got_recovery, want_recovery, "message={message:?}");
        }
    }
}
Loading