diff --git a/AGENTS.md b/AGENTS.md index f184d52..c62b3f3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -83,9 +83,10 @@ ruff format - McCabe complexity ≤ 10; refactor if exceeded. - Cognitive complexity ≤ 15; refactor if exceeded. - Remove unused imports and trailing whitespace. -- Max line length = 88 +- Max line length = 88; Try to be within 80. - Stick with ASCII characters in source code; Only use non-ASCII when native human language scripts provides clearer message. +- Avoid ambiguous variable name (E741) ## File headers @@ -165,7 +166,7 @@ Consistent fields: project name, version, author/contributor names, license, des - Metadata as YAML front matter between triple-dashed lines (Hugo/Jekyll style). - Standard Markdown; avoid GitHub-specific extensions. - `sentence case` for headings/titles. -- Max line length = 80 +- Max line length = 80; Except diagram, table, and URLs. - Run Markdownlint. ## HTML and CSS diff --git a/README.md b/README.md index 8a7a56b..1b08e40 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ Install extras to enable metadata extraction from model files: ```bash pip install -e ".[aimodel]" # all supported local AI model formats -pip install -e ".[huggingface]" # HuggingFace Hub model metadata +pip install -e ".[huggingface]" # Hugging Face Hub model metadata ``` or choose individual local formats: @@ -113,9 +113,9 @@ loom -m path/to/model.gguf Supported local formats: GGUF, ONNX, Safetensors, PyTorch (`.pt`/`.pth`), Keras, HDF5, NumPy, fastText. -#### HuggingFace model SBOM +#### Hugging Face model SBOM -Pass a HuggingFace URL or model ID directly - no local file required. +Pass a Hugging Face URL or model ID directly - no local file required. 
Pitloom fetches metadata from the Hub (model card, `config.json`, `tokenizer_config.json`, and `generation_config.json`) and produces an enriched `ai_AIPackage` SBOM with architecture, hyperparameters, license, @@ -194,7 +194,7 @@ generate_ai_model_sbom( pretty=True, ) -# Generate an SBOM from a HuggingFace model repository (no local file needed) +# Generate an SBOM from a Hugging Face model repository (no local file needed) from pitloom.assemble import generate_huggingface_sbom generate_huggingface_sbom( diff --git a/docs/implementation/license-pipeline.md b/docs/implementation/license-pipeline.md new file mode 100644 index 0000000..bbb8185 --- /dev/null +++ b/docs/implementation/license-pipeline.md @@ -0,0 +1,294 @@ +--- +SPDX-FileCopyrightText: 2026-present Arthit Suriyawongkul +SPDX-FileType: DOCUMENTATION +SPDX-License-Identifier: CC0-1.0 +--- + +# License detection pipeline + +This document describes how Pitloom detects, carries, and exports +licence information from its various input sources into a finished +SPDX 3 SBOM document. + +## Overview + +Licence data flows through three distinct stages: + +1. **Extract** — one or more source-specific extractors read licence + information from files and remote APIs. +2. **Model** — extracted data is normalised into a format-neutral + intermediate representation (`ProjectMetadata` or `AiModelMetadata`). +3. **Assemble and export** — the assembler converts the intermediate model + into SPDX 3 elements and serialises them as JSON-LD. 
+ +## Data flow diagram + +```text +Source inputs +---------------------------------------------------------------------- +pyproject.toml AI model file HuggingFace Hub repo +setup.cfg (PT2 extra/ (model card YAML) +CITATION.cff license) (LICENSE file + licenseid) +codemeta.json +LICENSE / LICENCE / + COPYING file + (+ licenseid) + | | | + v v v +---------------------------------------------------------------------- +EXTRACT LAYER (src/pitloom/extract/) +---------------------------------------------------------------------- +pyproject.py _pytorch_pt2.py _huggingface.py +setuptools.py (zip entry +---------------------------+ +poetry.py extra/license) | 1. card YAML license: | + | if vague/missing: | +_license.py ------------------------ | 2. _detect_license_ | + detect_license_for_project() | from_hf_files() | + |- pyproject.toml project.license | -> licenseid library | + |- CITATION.cff license: | (>=0.85 confidence)| + |- codemeta.json license: +---------------------------+ + +- LICENSE file (via licenseid) + | | | + v v v +---------------------------------------------------------------------- +FORMAT-NEUTRAL MODEL (src/pitloom/core/) +---------------------------------------------------------------------- +ProjectMetadata AiModelMetadata + .license_name: str | None .license: str | None + .provenance["license"]: str .provenance["license"]: str + | | + v v +---------------------------------------------------------------------- +ASSEMBLE LAYER (src/pitloom/assemble/spdx3/) +---------------------------------------------------------------------- +document.py build() document.py build_model() + main package / deps standalone AI model + | | + +---------------+---------------+ + | + ai.py add_ai_models() + deps.py build_license_elements() + |- reuse SimpleLicensingText if duplicate + +- else create simplelicensing_SimpleLicensingText + | + +----------+----------+ + v v + Relationship Relationship + hasDeclaredLicense hasConcludedLicense + (package -> license) (package -> 
license) + | + v +---------------------------------------------------------------------- +EXPORT LAYER (src/pitloom/export/spdx3_json.py) +---------------------------------------------------------------------- +Spdx3JsonExporter.to_json() + +- JSON-LD graph (@context + @graph) + |- simplelicensing_SimpleLicensingText + |- Relationship {relationshipType: hasDeclaredLicense} + +- Relationship {relationshipType: hasConcludedLicense} +``` + +## Stage 1: extract + +### Python project sources + +`src/pitloom/extract/pyproject.py` calls +`detect_license_for_project()` from `_license.py` after parsing +`pyproject.toml`. That function tries four sources in priority order: + +1. `project.license` in `pyproject.toml` (PEP 639 SPDX expression or + legacy text/file pointer). +2. `license:` scalar or list in `CITATION.cff`. +3. `license:` field in `codemeta.json` (URL values are reduced to their + SPDX ID segment). +4. Text content of `LICENSE`, `LICENCE`, `COPYING`, or `COPYRIGHT` + (with common suffixes) passed to `detect_license_from_text()` via + the `licenseid` library (≥ 0.85 confidence). + +`setuptools.py` and `poetry.py` follow the same pattern: they read +their respective `license` / `license_name` fields and store the result +in `ProjectMetadata.license_name`. + +All extractors record their source in `provenance["license"]` using the +`Source: … | Field: …` convention. 
+ +### AI model file sources + +Only formats that embed metadata in the file itself can carry a licence: + +| Format | Extractor | Licence field | +| :---------- | :--------------------- | :-------------------------- | +| PyTorch PT2 | `_pytorch_pt2.py` | `extra/license` zip entry | +| GGUF | `_gguf.py` | not yet mapped | +| Safetensors | `_safetensors.py` | not yet mapped | +| ONNX | `_onnx.py` | not yet mapped | +| Others | various | not yet mapped | + +The `AiModelMetadata.license` field is `None` when no embedded licence +is found; the assembler handles this gracefully by emitting no licence +relationships. + +### Hugging Face Hub source + +`_huggingface.py` implements a two-step resolution in `_resolve_license()`: + +1. **Card YAML** — reads `license:` from the model card frontmatter. If + the value is not a vague sentinel (`other`, `custom`, `proprietary`, + `unknown`, `unlicensed`), it is accepted as-is and stored in + `AiModelMetadata.license`. +2. **File detection** — when the card YAML value is absent or vague, + `_detect_license_from_hf_files()` iterates through candidate files in + the repository (`LICENSE`, `LICENCE`, `COPYING`, `NOTICE`, and + suffixed variants) in priority order. Each file is downloaded via + `hf_hub_download` and its text is passed to `detect_license_from_text()` + from the `licenseid` library. The first match above the 0.85 confidence + threshold is accepted. The original vague card value is preserved in + `extra_data["hf.license_raw"]` for auditability. + +### `licenseid` dependency + +Text-based licence detection (`detect_license_from_text()` in +`_license.py`) relies on the optional `licenseid` package. When the +package is not installed or its database has not been built, detection +is silently skipped and the function returns `None`. To enable it: + +```shell +pip install pitloom[license] +licenseid update +``` + +The database is stored at +`~/.local/share/licenseid/licenses.db`.
Detection uses cosine similarity +against vectorised licence texts with a default threshold of 0.85. + +## Stage 2: format-neutral model + +After extraction, licence data lives in one of two dataclasses: + +- `ProjectMetadata.license_name: str | None` — for Python projects. +- `AiModelMetadata.license: str | None` — for AI model files and + Hugging Face Hub models. + +Both carry a `provenance: dict[str, str]` where the `"license"` key +records a human-readable source description, for example: + +```text +Source: pyproject.toml | Field: project.license +Source: Hugging Face Hub | File: LICENSE | Method: licenseid_detection +Source: model.pt2 | Field: extra/license +``` + +## Stage 3: assemble and export + +### `build_license_elements()` — `assemble/spdx3/deps.py` + +This shared helper is called by every code path that needs to emit +licence relationships. It: + +1. Looks up `exporter.find_license(license_id)` to reuse an existing + `simplelicensing_SimpleLicensingText` element when the same licence + identifier has already been registered (avoids duplicates when + multiple packages share a licence). +2. If no match is found, creates a new + `simplelicensing_SimpleLicensingText` element with: + - `name`: first line of the identifier, truncated to 60 characters. + - `simplelicensing_licenseText`: the full licence identifier string. + - `comment`: `"Metadata provenance: license: <provenance>"`. +3. Builds and returns two fresh `Relationship` elements: + - `hasDeclaredLicense` — the licence declared in the software + artefact itself. + - `hasConcludedLicense` — the licence as concluded by the SBOM + creator (currently set to the same value; see in-code comment for + planned refinement). + +The caller is responsible for adding both relationships to the exporter.
+ +### Call sites + +| Call site | Subject package | Trigger condition | +| :--- | :--- | :--- | +| `document.py build()` | main Python package | `metadata.license_name` is set | +| `document.py build()` | each dependency | via `_enrich_from_installed()` | +| `ai.py add_ai_models()` | each AI model | `ai_model.license` is set | +| `document.py build_model()` | standalone AI model | `model.license` is set | + +### `profileConformance` + +When any licence relationship is added, the assembler appends +`simpleLicensing` to `SpdxDocument.profileConformance`. For documents +that mix Python and AI content, the check is de-duplicated so the +profile identifier appears exactly once regardless of how many packages +carry a licence. + +### Output elements + +For each package with a known licence, the JSON-LD graph contains: + +```jsonc +{ + "type": "simplelicensing_SimpleLicensingText", + "spdxId": "https://spdx.org/spdxdocs/License/Apache-2.0-1-", + "name": "Apache-2.0", + "simplelicensing_licenseText": "Apache-2.0", + "comment": "Metadata provenance: license: Source: pyproject.toml | Field: project.license" +}, +{ + "type": "Relationship", + "spdxId": "https://spdx.org/spdxdocs/Relationship/hasDeclaredLicense1-", + "relationshipType": "hasDeclaredLicense", + "from": "https://spdx.org/spdxdocs/Package/mypackage-1-", + "to": ["https://spdx.org/spdxdocs/License/Apache-2.0-1-"] +}, +{ + "type": "Relationship", + "spdxId": "https://spdx.org/spdxdocs/Relationship/hasConcludedLicense2-", + "relationshipType": "hasConcludedLicense", + "from": "https://spdx.org/spdxdocs/Package/mypackage-1-", + "to": ["https://spdx.org/spdxdocs/License/Apache-2.0-1-"] +} +``` + +## Limitations and future work + +- `hasDeclaredLicense` and `hasConcludedLicense` currently point to the + same `SimpleLicensingText` element. The SPDX 3 specification allows + them to differ (e.g. when multiple declared licences must be concluded + as a conjunction). Separate handling is deferred to a future version. 
+- GGUF, Safetensors, ONNX, and most other model formats do not embed a + machine-readable licence field. Licence data for those models must come + from an external source such as Hugging Face Hub or a user-supplied + fragment. +- `licenseid` text detection is probabilistic (threshold 0.85). Unusual + licence texts or heavily modified standard licences may not be + detected. Always verify the concluded licence in the SBOM. + +## Related source files + +| File | Role | +| :--- | :--- | +| `src/pitloom/extract/_license.py` | `detect_license_from_text()`, + `find_license_files()`, `detect_license_for_project()` | +| `src/pitloom/extract/pyproject.py` | Python project licence + extraction and detection | +| `src/pitloom/extract/setuptools.py` | setuptools project licence + extraction | +| `src/pitloom/extract/poetry.py` | Poetry project licence extraction | +| `src/pitloom/extract/_huggingface.py` | Hugging Face Hub card YAML + and file-based detection | +| `src/pitloom/extract/_pytorch_pt2.py` | PT2 archive `extra/license` + entry | +| `src/pitloom/core/project.py` | `ProjectMetadata.license_name` + field | +| `src/pitloom/core/ai_metadata.py` | `AiModelMetadata.license` + field | +| `src/pitloom/assemble/spdx3/deps.py` | `build_license_elements()` + shared helper | +| `src/pitloom/assemble/spdx3/document.py` | `build()` and + `build_model()` — licence wiring | +| `src/pitloom/assemble/spdx3/ai.py` | `add_ai_models()` — AI model + licence wiring | +| `src/pitloom/export/spdx3_json.py` | `Spdx3JsonExporter.find_license()`, + `add_license()` | +| `tests/test_license.py` | Unit tests for `_license.py` utilities | +| `tests/test_generator.py` | End-to-end licence export tests with + fixture files | diff --git a/src/pitloom/__main__.py b/src/pitloom/__main__.py index 0888206..f5b1ee8 100644 --- a/src/pitloom/__main__.py +++ b/src/pitloom/__main__.py @@ -98,12 +98,12 @@ def _build_parser() -> argparse.ArgumentParser: default=None, metavar="MODEL_FILE_OR_HF_URL", help=( -
"Path to a local AI model file, or a HuggingFace URL / model ID. " + "Path to a local AI model file, or a Hugging Face URL / model ID. " "Generate a standalone SBOM for the model as an AIPackage, " "without requiring a project directory. " "Local formats: GGUF, ONNX, Safetensors, PyTorch, " "Keras, HDF5, NumPy, fastText. " - "HuggingFace: full URL " + "Hugging Face: full URL " "(e.g. https://huggingface.co/mistralai/Mistral-7B-v0.1) " "or bare model ID (e.g. Qwen/Qwen3-235B-A22B)." ), @@ -521,7 +521,7 @@ def _resolve_model_output_path(explicit: Path | None, model_path: Path) -> Path: def _resolve_hf_output_path(explicit: Path | None, model_id: str) -> Path: - """Return the SBOM output path for a HuggingFace model SBOM. + """Return the SBOM output path for a Hugging Face model SBOM. Uses the explicit ``-o`` path when given; otherwise derives ``.spdx3.json`` from the model ID and writes it to the @@ -604,12 +604,12 @@ def _run_local_model_mode(args: argparse.Namespace, source: str) -> int: def _run_hf_model_mode(args: argparse.Namespace, source: str) -> int: - """Generate a standalone SBOM from a HuggingFace model repository.""" + """Generate a standalone SBOM from a Hugging Face model repository.""" try: model_id = parse_hf_model_id(source) if model_id is None: print( - f"Error: Not a valid HuggingFace URL or model ID: {source!r}", + f"Error: Not a valid Hugging Face URL or model ID: {source!r}", file=sys.stderr, ) return 1 @@ -626,9 +626,9 @@ def _run_hf_model_mode(args: argparse.Namespace, source: str) -> int: output_path = _resolve_hf_output_path(args.output, model_id) if args.verbose: - print(f"Pitloom version: {__version__}") - print(f"HuggingFace model : {model_id}") - print(f"Output path : {output_path}") + print(f"Pitloom version : {__version__}") + print(f"Hugging Face model : {model_id}") + print(f"Output path : {output_path}") generate_huggingface_sbom( model_id, @@ -640,7 +640,7 @@ def _run_hf_model_mode(args: argparse.Namespace, source: str) -> int: 
return 0 except Exception as e: # pylint: disable=broad-exception-caught - print(f"Error generating HuggingFace model SBOM: {e}", file=sys.stderr) + print(f"Error generating Hugging Face model SBOM: {e}", file=sys.stderr) traceback.print_exc() return 1 diff --git a/src/pitloom/assemble/__init__.py b/src/pitloom/assemble/__init__.py index 55a3de1..760e808 100644 --- a/src/pitloom/assemble/__init__.py +++ b/src/pitloom/assemble/__init__.py @@ -133,9 +133,9 @@ def generate_huggingface_sbom( pretty: bool = False, describe_relationship: bool = False, ) -> str: - """Generate a standalone SPDX 3 SBOM for a HuggingFace model repository. + """Generate a standalone SPDX 3 SBOM for a Hugging Face model repository. - Fetches metadata from the HuggingFace Hub (``config.json``, model card, + Fetches metadata from the Hugging Face Hub (``config.json``, model card, ``tokenizer_config.json``, etc.) and assembles an ``ai_AIPackage`` SBOM. No local model file is required. @@ -154,7 +154,7 @@ def generate_huggingface_sbom( Raises: ImportError: If ``huggingface_hub`` is not installed. - ValueError: If *model_source* is not a valid HuggingFace URL or model ID. + ValueError: If *model_source* is not a valid Hugging Face URL or model ID. 
""" model = read_huggingface(model_source) exporter = build_model(model, creation_info or CreationMetadata()) diff --git a/src/pitloom/assemble/spdx3/ai.py b/src/pitloom/assemble/spdx3/ai.py index 6377942..6623619 100644 --- a/src/pitloom/assemble/spdx3/ai.py +++ b/src/pitloom/assemble/spdx3/ai.py @@ -12,6 +12,7 @@ from spdx_python_model import v3_0_1 as spdx3 from pitloom.assemble.spdx3.dataset import add_datasets_for_model +from pitloom.assemble.spdx3.deps import build_license_elements from pitloom.core.ai_metadata import AiModelMetadata from pitloom.core.models import generate_spdx_id from pitloom.export.spdx3_json import Spdx3JsonExporter @@ -207,6 +208,22 @@ def add_ai_models( exporter=exporter, ) + if ai_model.license: + rel_declared, rel_concluded = build_license_elements( + license_id=ai_model.license, + package_spdx_id=ai_pkg.spdxId, + license_provenance=ai_model.provenance.get( + "license", + "Source: model file / Hugging Face Hub", + ), + creation_info=creation_info, + doc_name=doc_name, + doc_uuid=doc_uuid, + exporter=exporter, + ) + exporter.add_relationship(rel_declared) + exporter.add_relationship(rel_concluded) + rel = spdx3.Relationship( spdxId=generate_spdx_id( "Relationship", diff --git a/src/pitloom/assemble/spdx3/document.py b/src/pitloom/assemble/spdx3/document.py index 75e5762..43a80bc 100644 --- a/src/pitloom/assemble/spdx3/document.py +++ b/src/pitloom/assemble/spdx3/document.py @@ -317,6 +317,14 @@ def build(doc: DocumentModel, merkle_root: str | None = None) -> Spdx3JsonExport spdx_doc.profileConformance.append(spdx3.ProfileIdentifierType.ai) if any(m.datasets for m in doc.ai_models): spdx_doc.profileConformance.append(spdx3.ProfileIdentifierType.dataset) + if ( + any(m.license for m in doc.ai_models) + and spdx3.ProfileIdentifierType.simpleLicensing + not in spdx_doc.profileConformance + ): + spdx_doc.profileConformance.append( + spdx3.ProfileIdentifierType.simpleLicensing + ) add_ai_models( ai_models=doc.ai_models, 
main_package_spdx_id=main_package.spdxId, @@ -404,6 +412,22 @@ def build_model( ai_pkg = _build_ai_package(model, spdx_ci, doc_name, doc_uuid) exporter.add_package(ai_pkg) + if model.license: + rel_declared, rel_concluded = build_license_elements( + license_id=model.license, + package_spdx_id=ai_pkg.spdxId, + license_provenance=model.provenance.get( + "license", + "Source: model file / Hugging Face Hub", + ), + creation_info=spdx_ci, + doc_name=doc_name, + doc_uuid=doc_uuid, + exporter=exporter, + ) + exporter.add_relationship(rel_declared) + exporter.add_relationship(rel_concluded) + if model.datasets: add_datasets_for_model( ai_package_spdx_id=ai_pkg.spdxId, @@ -431,6 +455,8 @@ def build_model( spdx3.ProfileIdentifierType.software, spdx3.ProfileIdentifierType.ai, ] + if model.license: + spdx_doc.profileConformance.append(spdx3.ProfileIdentifierType.simpleLicensing) if model.datasets: spdx_doc.profileConformance.append(spdx3.ProfileIdentifierType.dataset) diff --git a/src/pitloom/extract/_huggingface.py b/src/pitloom/extract/_huggingface.py index c60fccd..01d3128 100644 --- a/src/pitloom/extract/_huggingface.py +++ b/src/pitloom/extract/_huggingface.py @@ -2,9 +2,9 @@ # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 -"""HuggingFace model repository metadata extractor. +"""Hugging Face model repository metadata extractor. -Fetches model metadata from a HuggingFace Hub repository URL or model ID +Fetches model metadata from a Hugging Face Hub repository URL or model ID and maps it to :class:`~pitloom.core.ai_metadata.AiModelMetadata`. Sources used (all optional - missing files are silently skipped): @@ -173,7 +173,7 @@ def parse_hf_model_id(source: str) -> str | None: """Return the HF model ID (``owner/name``) from a URL or direct ID. - Returns ``None`` when *source* does not look like a HuggingFace reference. + Returns ``None`` when *source* does not look like a Hugging Face reference. 
The check is intentionally conservative: the ``owner/name`` pattern is only accepted when the path does *not* exist on the local filesystem, to avoid misidentifying relative project paths like ``models/my_model``. @@ -193,7 +193,7 @@ def parse_hf_model_id(source: str) -> str | None: def is_huggingface_source(source: str) -> bool: - """Return ``True`` when *source* is a HuggingFace URL or model ID.""" + """Return ``True`` when *source* is a Hugging Face URL or model ID.""" return parse_hf_model_id(source) is not None @@ -317,7 +317,7 @@ def _detect_license_from_hf_files( if detected: return ( detected, - f"Source: HuggingFace Hub | File: {filename}" + f"Source: Hugging Face Hub | File: {filename}" " | Method: licenseid_detection", ) @@ -404,7 +404,7 @@ def _extract_description( return None desc = _extract_card_description(card_text) if desc: - provenance["description"] = "Source: HuggingFace Hub | Field: model card" + provenance["description"] = "Source: Hugging Face Hub | Field: model card" return desc @@ -425,7 +425,7 @@ def _resolve_license( if raw_license_str and raw_license_str.lower() not in _VAGUE_LICENSE_VALUES: provenance["license"] = ( - "Source: HuggingFace Hub | Field: model card YAML (license)" + "Source: Hugging Face Hub | Field: model card YAML (license)" ) return raw_license_str, None @@ -437,7 +437,7 @@ def _resolve_license( detected_id, detected_src = _detect_license_from_hf_files(model_id) if detected_id: provenance["license"] = detected_src or ( - "Source: HuggingFace Hub | Method: licenseid_detection" + "Source: Hugging Face Hub | Method: licenseid_detection" ) return detected_id, vague_raw return None, vague_raw @@ -458,13 +458,13 @@ def _parse_config_data( if model_type: type_of_model = str(model_type) provenance["type_of_model"] = ( - "Source: HuggingFace Hub | Field: config.json (model_type)" + "Source: Hugging Face Hub | Field: config.json (model_type)" ) architectures = config.get("architectures") if isinstance(architectures, list) and 
architectures: architecture = str(architectures[0]) provenance["architecture"] = ( - "Source: HuggingFace Hub | Field: config.json (architectures)" + "Source: Hugging Face Hub | Field: config.json (architectures)" ) hyperparameters: dict[str, Any] = {} @@ -480,7 +480,7 @@ def _parse_config_data( hyperparameters[f"generation.{key}"] = val if hyperparameters: provenance["hyperparameters"] = ( - "Source: HuggingFace Hub | Field: config.json / generation_config.json" + "Source: Hugging Face Hub | Field: config.json / generation_config.json" ) return type_of_model, architecture, hyperparameters @@ -529,7 +529,7 @@ def _extract_domains( if pipeline_tag: usage_domains.append(str(pipeline_tag)) provenance["domain"] = ( - "Source: HuggingFace Hub | Field: model card YAML (pipeline_tag)" + "Source: Hugging Face Hub | Field: model card YAML (pipeline_tag)" ) for tag in card_data.get("tags") or []: tag_str = str(tag) @@ -558,7 +558,7 @@ def _extract_datasets( download_url=f"https://huggingface.co/datasets/{ds_name}", provenance={ "name": ( - "Source: HuggingFace Hub" + "Source: Hugging Face Hub" " | Field: model card YAML (datasets)" ) }, @@ -566,7 +566,7 @@ def _extract_datasets( ) ) provenance["datasets"] = ( - "Source: HuggingFace Hub | Field: model card YAML (datasets)" + "Source: Hugging Face Hub | Field: model card YAML (datasets)" ) elif info_dataset_ids: for ds_name in info_dataset_ids: @@ -578,7 +578,7 @@ def _extract_datasets( download_url=f"https://huggingface.co/datasets/{ds_name}", provenance={ "name": ( - "Source: HuggingFace Hub" + "Source: Hugging Face Hub" " | Field: model_info tags (dataset:*)" ) }, @@ -586,7 +586,7 @@ def _extract_datasets( ) ) provenance["datasets"] = ( - "Source: HuggingFace Hub | Field: model_info tags (dataset:*)" + "Source: Hugging Face Hub | Field: model_info tags (dataset:*)" ) return datasets @@ -662,19 +662,19 @@ def _build_extra_data( if base_model_id: extra_data["hf.base_model"] = base_model_id provenance["base_model"] = ( - 
"Source: HuggingFace Hub | Field: model card YAML (base_model)" + "Source: Hugging Face Hub | Field: model card YAML (base_model)" ) if tag_data.base_model_relation: extra_data["hf.base_model_relation"] = tag_data.base_model_relation provenance["base_model_relation"] = ( - "Source: HuggingFace Hub | Field: model_info tags (base_model:relation)" + "Source: Hugging Face Hub | Field: model_info tags (base_model:relation)" ) if tag_data.doi_val: extra_data["hf.doi"] = tag_data.doi_val - provenance["doi"] = "Source: HuggingFace Hub | Field: model_info tags (doi:*)" + provenance["doi"] = "Source: Hugging Face Hub | Field: model_info tags (doi:*)" if extra_data: provenance["extra_data"] = ( - "Source: HuggingFace Hub" + "Source: Hugging Face Hub" " | Field: hub API / model card / tokenizer_config.json" ) return extra_data @@ -716,7 +716,7 @@ def _build_extra_lists( if extra_lists: provenance["extra_lists"] = ( - "Source: HuggingFace Hub" + "Source: Hugging Face Hub" " | Field: model card YAML (language / tags)" " / model_info tags (arxiv:*)" ) @@ -729,7 +729,7 @@ def _build_extra_lists( def read_huggingface(source: str) -> AiModelMetadata: - """Extract metadata from a HuggingFace model repository. + """Extract metadata from a Hugging Face model repository. Args: source: Full HF URL @@ -745,7 +745,7 @@ def read_huggingface(source: str) -> AiModelMetadata: Raises: ImportError: If ``huggingface_hub`` is not installed. - ValueError: If *source* is not a valid HuggingFace URL or model ID. + ValueError: If *source* is not a valid Hugging Face URL or model ID. """ try: # pylint: disable=import-outside-toplevel @@ -753,17 +753,17 @@ def read_huggingface(source: str) -> AiModelMetadata: except ImportError as exc: raise ImportError( "The 'huggingface_hub' package is required " - "to extract HuggingFace model metadata. " + "to extract Hugging Face model metadata. 
" "Install it with: pip install pitloom[huggingface]" ) from exc model_id = parse_hf_model_id(source) if model_id is None: - raise ValueError(f"Not a valid HuggingFace URL or model ID: {source!r}") + raise ValueError(f"Not a valid Hugging Face URL or model ID: {source!r}") hf_url = f"https://huggingface.co/{model_id}" model_name = model_id.split("/")[-1] - provenance: dict[str, str] = {"name": "Source: HuggingFace Hub | Field: model_id"} + provenance: dict[str, str] = {"name": "Source: Hugging Face Hub | Field: model_id"} hf_data = _fetch_all_hf_data(model_id) description = _extract_description(hf_data, provenance) diff --git a/tests/fixtures/README.md b/tests/fixtures/README.md index 506d9b4..a3e9cb2 100644 --- a/tests/fixtures/README.md +++ b/tests/fixtures/README.md @@ -72,12 +72,12 @@ dependency is not installed or the file is absent. | `safetensors/vits-tiny-random.safetensors` | Safetensors | Text-to-speech - VITS (random weights) | Apache-2.0 | | `safetensors/whisper-tiny-random.safetensors` | Safetensors | Speech recognition - Whisper (random weights) | Apache-2.0 | -## HuggingFace Hub mock fixtures +## Hugging Face Hub mock fixtures -`tests/test_extract_huggingface.py` exercises the HuggingFace metadata extractor +`tests/test_extract_huggingface.py` exercises the Hugging Face metadata extractor (`pitloom.extract._huggingface`) entirely through mocks - no network calls are made. Each model is represented by inline Python dicts that mirror the real -HuggingFace API responses (`config.json`, `tokenizer_config.json`, +Hugging Face API responses (`config.json`, `tokenizer_config.json`, `generation_config.json`, and the model card YAML frontmatter). 
The models were chosen to cover different configurations, access restrictions, diff --git a/tests/test_extract_huggingface.py b/tests/test_extract_huggingface.py index ec579c0..5e130df 100644 --- a/tests/test_extract_huggingface.py +++ b/tests/test_extract_huggingface.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # pylint: disable=too-many-lines -"""Tests for the HuggingFace model metadata extractor.""" +"""Tests for the Hugging Face model metadata extractor.""" from __future__ import annotations @@ -543,7 +543,9 @@ def test_license_from_file_when_card_has_none() -> None: "pitloom.extract._huggingface._detect_license_from_hf_files", return_value=( "Apache-2.0", - "Source: HuggingFace Hub | File: LICENSE | Method: licenseid_detection", + "Source: Hugging Face Hub " + "| File: LICENSE " + "| Method: licenseid_detection", ), ): meta = read_huggingface("org/model") @@ -559,7 +561,9 @@ def test_license_from_file_when_card_says_other() -> None: "pitloom.extract._huggingface._detect_license_from_hf_files", return_value=( "MIT", - "Source: HuggingFace Hub | File: LICENSE | Method: licenseid_detection", + "Source: Hugging Face Hub " + "| File: LICENSE " + "| Method: licenseid_detection", ), ): meta = read_huggingface("org/model") @@ -611,7 +615,7 @@ def test_detect_license_from_hf_files_returns_none_on_empty_file( def test_read_huggingface_invalid_source_raises() -> None: - with pytest.raises(ValueError, match="Not a valid HuggingFace"): + with pytest.raises(ValueError, match="Not a valid Hugging Face"): read_huggingface("/path/to/not/a/hf/model") @@ -632,7 +636,7 @@ def test_read_huggingface_no_huggingface_hub_raises() -> None: # Model zoo: varied real-world profiles (all mocked, no network calls) # =========================================================================== # -# Each fixture captures the real card/config data observed from HuggingFace Hub +# Each fixture captures the real card/config data observed from Hugging Face Hub # on 2026-05-08. 
They exercise distinct characteristics: # # Kokoro-82M - TTS, custom config schema (no model_type/architectures) @@ -932,7 +936,7 @@ def test_kimi_vague_license_triggers_file_detection() -> None: detected_mock = MagicMock( return_value=( "MIT", - "Source: HuggingFace Hub | File: LICENSE | Method: licenseid_detection", + "Source: Hugging Face Hub | File: LICENSE | Method: licenseid_detection", ) ) with _patch_kimi(): diff --git a/tests/test_generator.py b/tests/test_generator.py index 5de0369..c57104b 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -8,17 +8,20 @@ import tempfile from datetime import datetime, timezone from pathlib import Path +from typing import Any +import pytest from spdx_python_model import v3_0_1 as spdx3 from pitloom.assemble import generate_sbom -from pitloom.assemble.spdx3.document import build +from pitloom.assemble.spdx3.document import build, build_model from pitloom.core.ai_metadata import AiModelFormat, AiModelFormatInfo, AiModelMetadata from pitloom.core.creation import CreationMetadata from pitloom.core.document import DocumentModel from pitloom.core.models import generate_spdx_id from pitloom.core.project import ProjectMetadata from pitloom.export.spdx3_json import Spdx3JsonExporter +from pitloom.extract.ai_model import read_ai_model def test_generate_sbom_basic() -> None: @@ -367,3 +370,253 @@ def test_assembler_ai_model_with_inputs_outputs() -> None: contains_rels = [r for r in rels if r.get("relationshipType") == "contains"] assert len(contains_rels) == 1 assert any(pkg["spdxId"] in r["to"] for r in contains_rels) + + # no license relationships when ai_model.license is not set + license_rels = [ + r + for r in rels + if r.get("relationshipType") in ("hasDeclaredLicense", "hasConcludedLicense") + ] + assert not license_rels + + +# --------------------------------------------------------------------------- +# License relationship tests +# 
--------------------------------------------------------------------------- +# Each (model_name, license_id, hf_id) triple is taken from the model zoo in +# test_extract_huggingface.py, which records the actual values observed on +# Hugging Face Hub on 2026-05-08. Using real identifiers ensures the assembly +# layer is exercised with the full range of license strings found in practice: +# standard SPDX IDs, custom Hugging Face identifiers, and OpenRAIL variants. +# --------------------------------------------------------------------------- + +_AI_LICENSE_CASES: list[tuple[str, str, str]] = [ + # standard SPDX identifiers + ("Kokoro-82M", "apache-2.0", "hexgrad/Kokoro-82M"), + ("DeepSeek-R1", "mit", "deepseek-ai/DeepSeek-R1"), + # non-standard / custom Hugging Face license identifiers + ("starcoder2-3b", "bigcode-openrail-m", "bigcode/starcoder2-3b"), + ("Llama-3.2-1B", "llama3.2", "meta-llama/Llama-3.2-1B"), +] + + +def _check_license_relationships( + graph: list[dict[str, Any]], ai_pkg_id: str, license_id: str +) -> None: + """Assert hasDeclaredLicense and hasConcludedLicense relationships exist.""" + rels = [e for e in graph if e.get("type") == "Relationship"] + declared = [ + r + for r in rels + if r.get("relationshipType") == "hasDeclaredLicense" + and r.get("from") == ai_pkg_id + ] + concluded = [ + r + for r in rels + if r.get("relationshipType") == "hasConcludedLicense" + and r.get("from") == ai_pkg_id + ] + assert len(declared) == 1, "expected one hasDeclaredLicense relationship" + assert len(concluded) == 1, "expected one hasConcludedLicense relationship" + + license_spdx_id = declared[0]["to"][0] + license_elems = [ + e + for e in graph + if e.get("type") == "simplelicensing_SimpleLicensingText" + and e.get("spdxId") == license_spdx_id + ] + assert len(license_elems) == 1 + assert license_elems[0]["simplelicensing_licenseText"] == license_id + + spdx_docs = [e for e in graph if e.get("type") == "SpdxDocument"] + assert "simpleLicensing" in 
spdx_docs[0]["profileConformance"] + + +@pytest.mark.parametrize( + "model_name,license_id,hf_id", + _AI_LICENSE_CASES, + ids=[f"{n}-{lic}" for n, lic, _ in _AI_LICENSE_CASES], +) +def test_assembler_ai_model_with_license( + model_name: str, license_id: str, hf_id: str +) -> None: + """AI model with a license must produce hasDeclaredLicense and + hasConcludedLicense relationships, and simpleLicensing in profileConformance. + + Model/license pairs are taken from real Hugging Face Hub data recorded in + the model zoo (test_extract_huggingface.py, 2026-05-08). + """ + project = ProjectMetadata(name="ai-project", version="0.1.0") + ai_model = AiModelMetadata( + format_info=AiModelFormatInfo(model_format=AiModelFormat.SAFETENSORS), + name=model_name, + license=license_id, + provenance={ + "license": (f"Source: Hugging Face Hub ({hf_id}) | Field: cardData.license") + }, + ) + doc = DocumentModel( + project=project, creation=CreationMetadata(), ai_models=[ai_model] + ) + + exporter = build(doc) + data = json.loads(exporter.to_json(pretty=True)) + graph = data["@graph"] + + ai_pkgs = [e for e in graph if e.get("type") == "ai_AIPackage"] + assert len(ai_pkgs) == 1 + _check_license_relationships(graph, ai_pkgs[0]["spdxId"], license_id) + + +# Standalone build_model() cases: real GGUF and safetensors models. +# aisingapore/Gemma-SEA-LION-v4-4B-VL-GGUF uses a custom "gemma" license; +# deepseek-ai/DeepSeek-R1 uses the standard SPDX "mit" identifier. 
+_BUILD_MODEL_LICENSE_CASES: list[tuple[str, str, AiModelFormat, str]] = [ + ( + "Gemma-SEA-LION-v4-4B-VL-GGUF", + "gemma", + AiModelFormat.GGUF, + "aisingapore/Gemma-SEA-LION-v4-4B-VL-GGUF", + ), + ( + "DeepSeek-R1", + "mit", + AiModelFormat.SAFETENSORS, + "deepseek-ai/DeepSeek-R1", + ), +] + + +@pytest.mark.parametrize( + "model_name,license_id,fmt,hf_id", + _BUILD_MODEL_LICENSE_CASES, + ids=[f"{n}-{lic}" for n, lic, _, _ in _BUILD_MODEL_LICENSE_CASES], +) +def test_build_model_with_license( + model_name: str, license_id: str, fmt: AiModelFormat, hf_id: str +) -> None: + """build_model() for a standalone AI model must emit license relationships + and include simpleLicensing in profileConformance. + + Model/license pairs are taken from real Hugging Face Hub data recorded in + the model zoo (test_extract_huggingface.py, 2026-05-08). + """ + model = AiModelMetadata( + format_info=AiModelFormatInfo(model_format=fmt), + name=model_name, + license=license_id, + provenance={ + "license": (f"Source: Hugging Face Hub ({hf_id}) | Field: cardData.license") + }, + ) + + exporter = build_model(model, CreationMetadata()) + data = json.loads(exporter.to_json(pretty=True)) + graph = data["@graph"] + + ai_pkgs = [e for e in graph if e.get("type") == "ai_AIPackage"] + assert len(ai_pkgs) == 1 + _check_license_relationships(graph, ai_pkgs[0]["spdxId"], license_id) + + +def test_build_model_without_license() -> None: + """build_model() for a model with no license produces no license + relationships and no simpleLicensing in profileConformance. + + microsoft/resnet-18 is a real Hugging Face model that does not declare a + license in its model card, making it a realistic no-license test case. 
+ """ + model = AiModelMetadata( + format_info=AiModelFormatInfo(model_format=AiModelFormat.ONNX), + name="resnet-18", + ) + + exporter = build_model(model, CreationMetadata()) + data = json.loads(exporter.to_json(pretty=True)) + graph = data["@graph"] + + rels = [e for e in graph if e.get("type") == "Relationship"] + license_rels = [ + r + for r in rels + if r.get("relationshipType") in ("hasDeclaredLicense", "hasConcludedLicense") + ] + assert not license_rels + + spdx_docs = [e for e in graph if e.get("type") == "SpdxDocument"] + assert "simpleLicensing" not in spdx_docs[0]["profileConformance"] + + +# --------------------------------------------------------------------------- +# Fixture-based end-to-end license export tests +# --------------------------------------------------------------------------- +# These tests extract real metadata from local model files in tests/fixtures/, +# then assemble a standalone SPDX 3 document and verify the license +# relationships are present in the output. +# +# Many fixture files do not embed license metadata in their format (e.g. most +# Safetensors files only store {"format": "pt"} in __metadata__, and the GGUF +# extractor does not map general.license to ai_model.license). Those fixtures +# are skipped at runtime via pytest.skip() rather than excluded from the +# parametrize list, so that newly enhanced extractors will be picked up +# automatically without any change to this test. 
+# --------------------------------------------------------------------------- + +_FIXTURE_ROOT = Path(__file__).parent / "fixtures" +_AI_MODEL_DIRS = [ + "fasttext", + "gguf", + "hdf5", + "keras", + "numpy", + "onnx", + "pytorch", + "pytorch_pt2", + "safetensors", +] +_AI_MODEL_FIXTURES: list[Path] = [ + p + for d in _AI_MODEL_DIRS + for p in sorted((_FIXTURE_ROOT / d).glob("*")) + if p.is_file() and p.suffix != "" +] + + +@pytest.mark.parametrize( + "fixture_path", + _AI_MODEL_FIXTURES, + ids=[f"{p.parent.name}/{p.name}" for p in _AI_MODEL_FIXTURES], +) +def test_fixture_license_export(fixture_path: Path) -> None: + """Extract metadata from a fixture file and verify SPDX 3 license output. + + Skips when: + - The fixture file is absent from the repository clone. + - The required optional library is not installed. + - The model format does not embed a license (``meta.license is None``). + + When a license is present, asserts that the assembled ``build_model()`` + output contains both ``hasDeclaredLicense`` and ``hasConcludedLicense`` + relationships pointing to a ``simplelicensing_SimpleLicensingText`` element + whose ``simplelicensing_licenseText`` matches the extracted license string. 
+ """ + if not fixture_path.exists(): + pytest.skip(f"Fixture not found: {fixture_path}") + + try: + meta = read_ai_model(fixture_path) + except ImportError as exc: + pytest.skip(str(exc)) + + if meta.license is None: + pytest.skip(f"No license metadata embedded in {fixture_path.name}") + + exporter = build_model(meta, CreationMetadata()) + data = json.loads(exporter.to_json(pretty=True)) + graph = data["@graph"] + + ai_pkgs = [e for e in graph if e.get("type") == "ai_AIPackage"] + assert len(ai_pkgs) == 1 + _check_license_relationships(graph, ai_pkgs[0]["spdxId"], meta.license) diff --git a/tests/test_main_cli.py b/tests/test_main_cli.py index f59b090..e67122e 100644 --- a/tests/test_main_cli.py +++ b/tests/test_main_cli.py @@ -498,7 +498,7 @@ def test_model_mode_onnx_sbom_root_is_ai_package( # --------------------------------------------------------------------------- -# -m / --aimodel: HuggingFace URL / model-ID mode tests (mocked) +# -m / --aimodel: Hugging Face URL / model-ID mode tests (mocked) # ---------------------------------------------------------------------------