cdgriffith · cdgriffith · Apr 8, 2026 · Apr 6, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,18 @@
 Changelog
 =========
 
+Version 2.2.0
+-------------
+
+- Adding Ogg scanner to distinguish Vorbis, Opus, Theora, FLAC, Speex, and OGM codecs
+- Adding ASF scanner to distinguish WMV (video) from WMA (audio) files
+- Adding EBML scanner to distinguish Matroska (.mkv) from WebM (.webm) files
+- Adding MSI (Windows Installer) and MPP (Microsoft Project) detection to CFBF scanner
+- Fixing #146 OOXML detection to use the content types in `[Content_Types].xml` as the primary method, correctly identifying docx/xlsx/pptx files from LibreOffice, Google Docs, and other non-Microsoft tools (thanks to jonasdeboeck79)
+- Fixing ZIP deep scan to inspect all ZIP files instead of short-circuiting on the .zip extension
+- Fixing text scanner to treat files containing NUL bytes as binary data instead of misidentifying them as text
+- Fixing mz5 HDF5 scanner typo in chromatogram dataset name
+
 Version 2.1.1
 -------------
 

diff --git a/puremagic/main.py b/puremagic/main.py
@@ -32,10 +32,13 @@
         mpeg_audio_scanner,
         hdf5_scanner,
         cfbf_scanner,
+        ogg_scanner,
+        asf_scanner,
+        ebml_scanner,
     )
 
 __author__ = "Chris Griffith"
-__version__ = "2.1.1"
+__version__ = "2.2.0"
 __all__ = [
     "magic_file",
     "magic_string",
@@ -206,10 +209,12 @@ def identify_all(header: bytes, footer: bytes, ext=None) -> list[PureMagicWithCo
     return determine_confidence(matches, ext)
 
 
-def perform_magic(header: bytes, footer: bytes, mime: bool, ext=None, filename=None) -> str:
+def perform_magic(header: bytes | None, footer: bytes | None, mime: bool | None, ext=None, filename=None) -> str:
     """Discover what type of file it is based on the incoming string"""
     if not header:
         raise PureValueError("Input was empty")
+    if not footer:
+        footer = b""
     infos = identify_all(header, footer, ext)
     if filename and os.path.isfile(filename) and os.getenv("PUREMAGIC_DEEPSCAN") != "0":
         results = run_deep_scan(infos, filename, header, footer, raise_on_none=not infos)
@@ -466,6 +471,14 @@ def single_deep_scan(
                 return result
         case cfbf_scanner.match_bytes | cfbf_scanner.match_bytes_short:
             return cfbf_scanner.main(filename, head, foot)
+        case ogg_scanner.match_bytes:
+            result = ogg_scanner.main(filename, head, foot)
+            if result and result.confidence > confidence:
+                return result
+        case asf_scanner.match_bytes:
+            return asf_scanner.main(filename, head, foot)
+        case ebml_scanner.match_bytes:
+            return ebml_scanner.main(filename, head, foot)
 
     if eml_result := text_scanner.eml_check(head):
         return eml_result

diff --git a/puremagic/scanners/asf_scanner.py b/puremagic/scanners/asf_scanner.py
@@ -0,0 +1,59 @@
+import os
+import struct
+
+from puremagic.scanners.helpers import Match
+
+# ASF Header Object GUID {75B22630-668E-11CF-A6D9-00AA0062CE6C}, written in its
+# on-disk byte order (first three GUID fields little-endian). main() requires
+# the file to start with these 16 bytes.
+match_bytes = b"\x30\x26\xb2\x75\x8e\x66\xcf\x11\xa6\xd9\x00\xaa\x00\x62\xce\x6c"
+
+# GUIDs compared inside the header, in the same on-disk byte order:
+#   _STREAM_PROPS_GUID {B7DC0791-A9B7-11CF-8EE6-00C00C205365} - object GUID main() looks for
+#   _AUDIO_MEDIA_GUID  {F8699E40-5B4D-11CF-A8FD-00805F5C442B} - stream type treated as audio
+#   _VIDEO_MEDIA_GUID  {BC19EFC0-5B4D-11CF-A8FD-00805F5C442B} - stream type treated as video
+_STREAM_PROPS_GUID = b"\x91\x07\xdc\xb7\xb7\xa9\xcf\x11\x8e\xe6\x00\xc0\x0c\x20\x53\x65"
+_AUDIO_MEDIA_GUID = b"\x40\x9e\x69\xf8\x4d\x5b\xcf\x11\xa8\xfd\x00\x80\x5f\x5c\x44\x2b"
+_VIDEO_MEDIA_GUID = b"\xc0\xef\x19\xbc\x4d\x5b\xcf\x11\xa8\xfd\x00\x80\x5f\x5c\x44\x2b"
+
+
+def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
+    """Classify an ASF container as WMV, WMA, or generic ASF.
+
+    Walks the objects that follow the 30-byte ASF header preamble, looking for
+    Stream Properties objects, and records whether any of them declares a
+    video or an audio stream type.
+
+    Args:
+        file_path: Path of the file being scanned; only opened when the header
+            size field is larger than ``head``.
+        head: Leading bytes of the file.
+        foot: Trailing bytes of the file (not used here).
+
+    Returns:
+        ``None`` when ``head`` is shorter than 30 bytes, does not start with
+        ``match_bytes``, or the file cannot be re-read. Otherwise a ``Match``
+        for ``.wmv`` (at least one video stream), ``.wma`` (audio but no
+        video), or ``.asf`` (no recognised stream type found).
+    """
+    if not head or len(head) < 30:
+        return None
+    if head[:16] != match_bytes:
+        return None
+
+    # Little-endian fields right after the GUID: 64-bit header size at offset
+    # 16, 32-bit count of header objects at offset 24.
+    header_size = struct.unpack_from("<Q", head, 16)[0]
+    obj_count = struct.unpack_from("<I", head, 24)[0]
+
+    # Read the full ASF header if our head buffer is too small
+    if header_size > len(head):
+        try:
+            with open(file_path, "rb") as f:
+                # Cap the read at 64 KiB regardless of the declared header size.
+                data = f.read(min(int(header_size), 65536))
+        except (OSError, ValueError):
+            return None
+    else:
+        data = head
+
+    has_audio = False
+    has_video = False
+    offset = 30  # Past header GUID(16) + size(8) + count(4) + reserved(2)
+
+    # Visit at most 50 objects, whatever count the file declares.
+    for _ in range(min(obj_count, 50)):
+        # Need a full 16-byte GUID plus 8-byte size to read this object.
+        if offset + 24 > len(data):
+            break
+        obj_guid = data[offset : offset + 16]
+        obj_size = struct.unpack_from("<Q", data, offset + 16)[0]
+        # A size below the 24-byte GUID + size prefix cannot be valid; stop.
+        if obj_size < 24:
+            break
+
+        # The stream type GUID is the 16 bytes right after the object prefix.
+        if obj_guid == _STREAM_PROPS_GUID and offset + 40 <= len(data):
+            stream_type = data[offset + 24 : offset + 40]
+            if stream_type == _VIDEO_MEDIA_GUID:
+                has_video = True
+            elif stream_type == _AUDIO_MEDIA_GUID:
+                has_audio = True
+
+        # The object size includes its own prefix, so this lands on the next object.
+        offset += int(obj_size)
+
+    # Video wins over audio: a file with both kinds of stream is reported as WMV.
+    if has_video:
+        return Match(".wmv", "Windows Media Video", "video/x-ms-wmv")
+    if has_audio:
+        return Match(".wma", "Windows Media Audio", "audio/x-ms-wma")
+
+    return Match(".asf", "Advanced Systems Format", "video/x-ms-asf")
diff --git a/puremagic/scanners/cfbf_scanner.py b/puremagic/scanners/cfbf_scanner.py
@@ -24,6 +24,32 @@
     ("__substg1.0_", ".msg", "Outlook Message", "application/vnd.ms-outlook"),
 ]
 
+# Multi-stream detection: all listed streams must be present.
+# Each entry: (required_streams, extension, name, mime_type)
+# _identify_format consults this table after the prefix and single-stream checks.
+_MULTI_STREAM_MATCHES = [
+    (("_StringPool", "_StringData"), ".msi", "Windows Installer Package", "application/x-msi"),
+]
+
+# Root directory entry CLSIDs that identify specific formats.
+# CLSIDs are stored in mixed-endian format in CFBF files.
+# That is, the first three GUID fields are byte-swapped (little-endian) and the
+# final eight bytes appear in the order they are written in the textual GUID.
+# Each entry: (clsid_bytes, extension, name, mime_type)
+# _identify_format consults this table last, once no stream-name check matched.
+_CLSID_MATCHES = [
+    # Microsoft Project 98/2000/2002/2003: {74b78f3a-c8c8-11d1-be11-00c04fb6faf1}
+    (
+        b"\x3a\x8f\xb7\x74\xc8\xc8\xd1\x11\xbe\x11\x00\xc0\x4f\xb6\xfa\xf1",
+        ".mpp",
+        "Microsoft Project",
+        "application/vnd.ms-project",
+    ),
+    # Microsoft Project 4.x: {72fd3320-9a05-11cf-85a4-00a0c904de5f}
+    (
+        b"\x20\x33\xfd\x72\x05\x9a\xcf\x11\x85\xa4\x00\xa0\xc9\x04\xde\x5f",
+        ".mpp",
+        "Microsoft Project",
+        "application/vnd.ms-project",
+    ),
+]
+
 
 def _extract_stream_names(dir_data: bytes) -> set[str]:
     """Parse CFBF directory entries and return the set of stream/storage names."""
@@ -45,8 +71,19 @@ def _extract_stream_names(dir_data: bytes) -> set[str]:
     return names
 
 
-def _identify_format(stream_names: set[str]) -> Match | None:
-    """Match stream names against known CFBF format signatures."""
+def _extract_root_clsid(dir_data: bytes) -> bytes | None:
+    """Return the 16-byte CLSID of the first root-storage directory entry.
+
+    Steps through ``dir_data`` in 128-byte directory entries and returns bytes
+    80-95 of the first entry whose object-type byte (offset 66) is 5.
+
+    Args:
+        dir_data: Raw directory data holding consecutive 128-byte entries.
+
+    Returns:
+        The 16 CLSID bytes, or ``None`` if no root-storage entry is found
+        before the data runs out.
+    """
+    for i in range(0, len(dir_data), 128):
+        entry = dir_data[i : i + 128]
+        # A truncated trailing entry is too short to hold the CLSID field.
+        if len(entry) < 96:
+            break
+        if entry[66] == 5:  # Root storage
+            return entry[80:96]
+    return None
+
+
+def _identify_format(stream_names: set[str], dir_data: bytes) -> Match | None:
+    """Match stream names and CLSIDs against known CFBF format signatures."""
     # Check prefix matches first (e.g. __substg1.0_ for MSG)
     for name in stream_names:
         for prefix, ext, fmt_name, mime in _PREFIX_MATCHES:
@@ -58,6 +95,18 @@ def _identify_format(stream_names: set[str]) -> Match | None:
         if stream_name in stream_names:
             return Match(ext, fmt_name, mime)
 
+    # Check multi-stream matches (all required streams must be present)
+    for required_streams, ext, fmt_name, mime in _MULTI_STREAM_MATCHES:
+        if all(s in stream_names for s in required_streams):
+            return Match(ext, fmt_name, mime)
+
+    # Check root CLSID
+    root_clsid = _extract_root_clsid(dir_data)
+    if root_clsid:
+        for clsid, ext, fmt_name, mime in _CLSID_MATCHES:
+            if root_clsid == clsid:
+                return Match(ext, fmt_name, mime)
+
     return None
 
 
@@ -94,4 +143,4 @@ def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
         return None
 
     stream_names = _extract_stream_names(dir_data)
-    return _identify_format(stream_names)
+    return _identify_format(stream_names, dir_data)
diff --git a/puremagic/scanners/ebml_scanner.py b/puremagic/scanners/ebml_scanner.py
@@ -0,0 +1,21 @@
+import os
+
+from puremagic.scanners.helpers import Match
+
+# EBML header element ID (0x1A45DFA3); main() requires it at offset 0.
+match_bytes = b"\x1a\x45\xdf\xa3"
+
+
+def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
+    """Tell WebM from Matroska by the DocType string near the start of the file.
+
+    Args:
+        file_path: Path of the file being scanned (not used here).
+        head: Leading bytes of the file; only the first 64 are searched.
+        foot: Trailing bytes of the file (not used here).
+
+    Returns:
+        A ``.webm`` ``Match`` if ``b"webm"`` occurs in the first 64 bytes,
+        else a ``.mkv`` ``Match`` if ``b"matroska"`` occurs there. ``None``
+        when ``head`` is shorter than 8 bytes, does not start with
+        ``match_bytes``, or contains neither string.
+    """
+    if not head or len(head) < 8:
+        return None
+    if head[:4] != match_bytes:
+        return None
+
+    # Search for DocType string in the EBML header (first 64 bytes)
+    # This is a plain substring search, not an EBML parse; "webm" is tested first.
+    search_area = head[:64]
+    if b"webm" in search_area:
+        return Match(".webm", "WebM Video", "video/webm")
+    if b"matroska" in search_area:
+        return Match(".mkv", "Matroska Video", "video/x-matroska")
+
+    return None
diff --git a/puremagic/scanners/hdf5_scanner.py b/puremagic/scanners/hdf5_scanner.py
@@ -32,7 +32,7 @@
         "application/x-biom2",
     ),
     # mz5 - mass spectrometry
-    ([], [b"/SpectrumMetaData", b"/ChomatogramMetaData"], 1, ".mz5", "mz5 mass spectrometry data", "application/x-mz5"),
+    ([], [b"/SpectrumMetaData", b"/ChromatogramList"], 1, ".mz5", "mz5 mass spectrometry data", "application/x-mz5"),
     # h5mlm - ML model
     ([], [b"model_type", b"h5mlm"], 1, ".h5mlm", "HDF5 ML model", "application/x-h5mlm"),
 ]

diff --git a/puremagic/scanners/ogg_scanner.py b/puremagic/scanners/ogg_scanner.py
@@ -0,0 +1,39 @@
+import os
+
+from puremagic.scanners.helpers import Match
+
+# Ogg page capture pattern; main() requires it in the first four bytes.
+match_bytes = b"OggS"
+
+# Ogg codec identification signatures found at the start of the first page payload.
+# Each entry: (codec_id_bytes, extension, name, mime_type)
+# main() tests the entries in list order and returns the first whose
+# codec_id_bytes is a prefix of the payload.
+_OGG_CODEC_MAP = [
+    (b"\x01vorbis", ".ogg", "Ogg Vorbis Audio", "audio/ogg"),
+    (b"OpusHead", ".opus", "Ogg Opus Audio", "audio/ogg"),
+    (b"\x80theora", ".ogv", "Ogg Theora Video", "video/ogg"),
+    (b"\x7fFLAC", ".oga", "Ogg FLAC Audio", "audio/ogg"),
+    (b"Speex   ", ".spx", "Ogg Speex Audio", "audio/ogg"),
+    (b"fishead\x00", ".ogv", "Ogg Annodex", "video/ogg"),
+    (b"\x01video", ".ogm", "OGM Video", "video/x-ogm+ogg"),
+]
+
+
+def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
+    """Identify the codec carried in an Ogg stream from its first page.
+
+    Args:
+        file_path: Path of the file being scanned (not used here).
+        head: Leading bytes of the file; must contain the page header, the
+            segment table, and at least one payload byte.
+        foot: Trailing bytes of the file (not used here).
+
+    Returns:
+        A ``Match`` with confidence 0.9 for the first ``_OGG_CODEC_MAP`` entry
+        whose signature prefixes the first page's payload. ``None`` when
+        ``head`` is too short, the page-header checks fail, or no signature
+        matches.
+    """
+    if not head or len(head) < 28:
+        return None
+
+    # Verify OggS capture pattern, version 0, and beginning-of-stream flag
+    if head[:4] != match_bytes or head[4] != 0 or not (head[5] & 0x02):
+        return None
+
+    # Byte 26 holds the segment count; the segment table starts at byte 27 with
+    # one byte per segment, and the page payload follows it.
+    seg_count = head[26]
+    payload_start = 27 + seg_count
+
+    # The payload must begin inside the bytes we were handed.
+    if payload_start >= len(head):
+        return None
+
+    payload = head[payload_start:]
+    for codec_id, ext, name, mime in _OGG_CODEC_MAP:
+        if payload.startswith(codec_id):
+            return Match(ext, name, mime, confidence=0.9)
+
+    return None
diff --git a/puremagic/scanners/text_scanner.py b/puremagic/scanners/text_scanner.py
@@ -200,6 +200,10 @@ def main(file_path: os.PathLike | str, _, __) -> Match | None:
     if len(head) < 8:
         return Match("", "very short file", "application/octet-stream", confidence=0.5)
 
+    # NUL bytes indicate binary data, but skip this check for UTF-16 (which has NUL bytes naturally)
+    if b"\x00" in head and head[:2] not in (b"\xff\xfe", b"\xfe\xff"):
+        return Match("", "data", "application/octet-stream", confidence=0.5)
+
     try:
         text, encoding = decode_any(head)
     except TypeError: