Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
Changelog
=========

Version 2.2.0
-------------

- Adding Ogg scanner to distinguish Vorbis, Opus, Theora, FLAC, Speex, and OGM codecs
- Adding ASF scanner to distinguish WMV (video) from WMA (audio) files
- Adding EBML scanner to distinguish Matroska (.mkv) from WebM (.webm) files
- Adding MSI (Windows Installer) and MPP (Microsoft Project) detection to CFBF scanner
- Fixing #146: OOXML detection now uses `[Content_Types].xml` content types as the primary method, correctly identifying docx/xlsx/pptx files from LibreOffice, Google Docs, and other non-Microsoft tools (thanks to jonasdeboeck79)
- Fixing ZIP deep scan, which now inspects all ZIP files instead of short-circuiting on the .zip extension
- Fixing text scanner, which now treats files containing NUL bytes as binary data instead of misidentifying them as text
- Fixing mz5 HDF5 scanner typo in chromatogram dataset name

Version 2.1.1
-------------

Expand Down
17 changes: 15 additions & 2 deletions puremagic/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,13 @@
mpeg_audio_scanner,
hdf5_scanner,
cfbf_scanner,
ogg_scanner,
asf_scanner,
ebml_scanner,
)

__author__ = "Chris Griffith"
__version__ = "2.1.1"
__version__ = "2.2.0"
__all__ = [
"magic_file",
"magic_string",
Expand Down Expand Up @@ -206,10 +209,12 @@ def identify_all(header: bytes, footer: bytes, ext=None) -> list[PureMagicWithCo
return determine_confidence(matches, ext)


def perform_magic(header: bytes, footer: bytes, mime: bool, ext=None, filename=None) -> str:
def perform_magic(header: bytes | None, footer: bytes | None, mime: bool | None, ext=None, filename=None) -> str:
"""Discover what type of file it is based on the incoming string"""
if not header:
raise PureValueError("Input was empty")
if not footer:
footer = b""
infos = identify_all(header, footer, ext)
if filename and os.path.isfile(filename) and os.getenv("PUREMAGIC_DEEPSCAN") != "0":
results = run_deep_scan(infos, filename, header, footer, raise_on_none=not infos)
Expand Down Expand Up @@ -466,6 +471,14 @@ def single_deep_scan(
return result
case cfbf_scanner.match_bytes | cfbf_scanner.match_bytes_short:
return cfbf_scanner.main(filename, head, foot)
case ogg_scanner.match_bytes:
result = ogg_scanner.main(filename, head, foot)
if result and result.confidence > confidence:
return result
case asf_scanner.match_bytes:
return asf_scanner.main(filename, head, foot)
case ebml_scanner.match_bytes:
return ebml_scanner.main(filename, head, foot)

if eml_result := text_scanner.eml_check(head):
return eml_result
Expand Down
59 changes: 59 additions & 0 deletions puremagic/scanners/asf_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
import struct

from puremagic.scanners.helpers import Match

# ASF Header Object GUID
# Canonical form: {75B22630-668E-11CF-A6D9-00AA0062CE6C}. GUIDs are stored on disk
# with their first three fields little-endian, hence the byte order below.
match_bytes = b"\x30\x26\xb2\x75\x8e\x66\xcf\x11\xa6\xd9\x00\xaa\x00\x62\xce\x6c"

# Stream Properties Object GUID: {B7DC0791-A9B7-11CF-8EE6-00C00C205365}.
# main() looks for objects carrying this GUID inside the ASF header.
_STREAM_PROPS_GUID = b"\x91\x07\xdc\xb7\xb7\xa9\xcf\x11\x8e\xe6\x00\xc0\x0c\x20\x53\x65"
# Stream-type GUID for audio streams: {F8699E40-5B4D-11CF-A8FD-00805F5C442B}.
_AUDIO_MEDIA_GUID = b"\x40\x9e\x69\xf8\x4d\x5b\xcf\x11\xa8\xfd\x00\x80\x5f\x5c\x44\x2b"
# Stream-type GUID for video streams: {BC19EFC0-5B4D-11CF-A8FD-00805F5C442B}.
_VIDEO_MEDIA_GUID = b"\xc0\xef\x19\xbc\x4d\x5b\xcf\x11\xa8\xfd\x00\x80\x5f\x5c\x44\x2b"


def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
    """Classify an ASF container as WMV, WMA, or plain ASF.

    The ASF header is walked object by object looking for Stream Properties
    objects; the stream-type GUID of each one decides whether the file carries
    video and/or audio.

    Args:
        file_path: Path of the file being scanned. Only opened when the header
            size declared in ``head`` is larger than ``head`` itself.
        head: Leading bytes of the file.
        foot: Trailing bytes of the file (unused).

    Returns:
        A ``.wmv`` match if any video stream was seen, otherwise a ``.wma``
        match if any audio stream was seen, otherwise a generic ``.asf`` match.
        ``None`` when ``head`` is shorter than 30 bytes, does not start with
        the ASF Header Object GUID, or the file cannot be re-read.
    """
    if not head or len(head) < 30 or head[:16] != match_bytes:
        return None

    # Header object layout: GUID(16) + size(8) + object count(4) + reserved(2).
    declared_size, declared_objects = struct.unpack_from("<QI", head, 16)

    buffer = head
    if declared_size > len(head):
        # The supplied head does not cover the whole header; re-read it from
        # disk, capped at 64 KiB.
        try:
            with open(file_path, "rb") as handle:
                buffer = handle.read(min(int(declared_size), 65536))
        except (OSError, ValueError):
            return None

    saw_audio = saw_video = False
    cursor = 30  # first sub-object starts right after the 30-byte header preamble
    remaining = min(declared_objects, 50)  # never walk more than 50 objects

    while remaining > 0 and cursor + 24 <= len(buffer):
        remaining -= 1
        (size,) = struct.unpack_from("<Q", buffer, cursor + 16)
        if size < 24:
            # An object smaller than its own GUID+size prefix is invalid; stop.
            break

        is_stream_props = buffer[cursor : cursor + 16] == _STREAM_PROPS_GUID
        if is_stream_props and cursor + 40 <= len(buffer):
            media_guid = buffer[cursor + 24 : cursor + 40]
            if media_guid == _VIDEO_MEDIA_GUID:
                saw_video = True
            elif media_guid == _AUDIO_MEDIA_GUID:
                saw_audio = True

        cursor += int(size)

    # Video wins over audio: a file with both kinds of stream is reported as WMV.
    if saw_video:
        return Match(".wmv", "Windows Media Video", "video/x-ms-wmv")
    if saw_audio:
        return Match(".wma", "Windows Media Audio", "audio/x-ms-wma")
    return Match(".asf", "Advanced Systems Format", "video/x-ms-asf")
55 changes: 52 additions & 3 deletions puremagic/scanners/cfbf_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,32 @@
("__substg1.0_", ".msg", "Outlook Message", "application/vnd.ms-outlook"),
]

# Multi-stream detection: all listed streams must be present.
# Each entry: (required_streams, extension, name, mime_type)
# Consulted by _identify_format() via all(s in stream_names for s in required_streams).
_MULTI_STREAM_MATCHES = [
    (("_StringPool", "_StringData"), ".msi", "Windows Installer Package", "application/x-msi"),
]

# Root directory entry CLSIDs that identify specific formats.
# CLSIDs are stored in mixed-endian format in CFBF files.
# That is, the first three GUID fields are byte-swapped relative to the textual
# form shown in each comment, while the trailing eight bytes are kept in order.
# Each entry: (clsid_bytes, extension, name, mime_type)
# _identify_format() compares these against the 16 bytes returned by _extract_root_clsid().
_CLSID_MATCHES = [
    # Microsoft Project 98/2000/2002/2003: {74b78f3a-c8c8-11d1-be11-00c04fb6faf1}
    (
        b"\x3a\x8f\xb7\x74\xc8\xc8\xd1\x11\xbe\x11\x00\xc0\x4f\xb6\xfa\xf1",
        ".mpp",
        "Microsoft Project",
        "application/vnd.ms-project",
    ),
    # Microsoft Project 4.x: {72fd3320-9a05-11cf-85a4-00a0c904de5f}
    (
        b"\x20\x33\xfd\x72\x05\x9a\xcf\x11\x85\xa4\x00\xa0\xc9\x04\xde\x5f",
        ".mpp",
        "Microsoft Project",
        "application/vnd.ms-project",
    ),
]


def _extract_stream_names(dir_data: bytes) -> set[str]:
"""Parse CFBF directory entries and return the set of stream/storage names."""
Expand All @@ -45,8 +71,19 @@ def _extract_stream_names(dir_data: bytes) -> set[str]:
return names


def _identify_format(stream_names: set[str]) -> Match | None:
"""Match stream names against known CFBF format signatures."""
def _extract_root_clsid(dir_data: bytes) -> bytes | None:
"""Extract the CLSID from the root directory entry (obj_type 5)."""
for i in range(0, len(dir_data), 128):
entry = dir_data[i : i + 128]
if len(entry) < 96:
break
if entry[66] == 5: # Root storage
return entry[80:96]
return None


def _identify_format(stream_names: set[str], dir_data: bytes) -> Match | None:
"""Match stream names and CLSIDs against known CFBF format signatures."""
# Check prefix matches first (e.g. __substg1.0_ for MSG)
for name in stream_names:
for prefix, ext, fmt_name, mime in _PREFIX_MATCHES:
Expand All @@ -58,6 +95,18 @@ def _identify_format(stream_names: set[str]) -> Match | None:
if stream_name in stream_names:
return Match(ext, fmt_name, mime)

# Check multi-stream matches (all required streams must be present)
for required_streams, ext, fmt_name, mime in _MULTI_STREAM_MATCHES:
if all(s in stream_names for s in required_streams):
return Match(ext, fmt_name, mime)

# Check root CLSID
root_clsid = _extract_root_clsid(dir_data)
if root_clsid:
for clsid, ext, fmt_name, mime in _CLSID_MATCHES:
if root_clsid == clsid:
return Match(ext, fmt_name, mime)

return None


Expand Down Expand Up @@ -94,4 +143,4 @@ def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
return None

stream_names = _extract_stream_names(dir_data)
return _identify_format(stream_names)
return _identify_format(stream_names, dir_data)
21 changes: 21 additions & 0 deletions puremagic/scanners/ebml_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os

from puremagic.scanners.helpers import Match

# Leading four bytes required by main(); this is the EBML header element ID (0x1A45DFA3).
match_bytes = b"\x1a\x45\xdf\xa3"


def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
    """Tell WebM and Matroska apart by the DocType string in the EBML header.

    Args:
        file_path: Path of the file being scanned (unused).
        head: Leading bytes of the file.
        foot: Trailing bytes of the file (unused).

    Returns:
        A ``.webm`` match if ``b"webm"`` occurs in the first 64 bytes, else a
        ``.mkv`` match if ``b"matroska"`` does. ``None`` when ``head`` is
        shorter than 8 bytes, lacks the EBML magic, or contains neither string.
    """
    if not head or len(head) < 8 or head[:4] != match_bytes:
        return None

    # Search for the DocType string in the EBML header (first 64 bytes).
    window = head[:64]
    doctypes = (
        (b"webm", ".webm", "WebM Video", "video/webm"),
        (b"matroska", ".mkv", "Matroska Video", "video/x-matroska"),
    )
    for marker, extension, label, mime in doctypes:
        if marker in window:
            return Match(extension, label, mime)

    return None
2 changes: 1 addition & 1 deletion puremagic/scanners/hdf5_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"application/x-biom2",
),
# mz5 - mass spectrometry
([], [b"/SpectrumMetaData", b"/ChomatogramMetaData"], 1, ".mz5", "mz5 mass spectrometry data", "application/x-mz5"),
([], [b"/SpectrumMetaData", b"/ChromatogramList"], 1, ".mz5", "mz5 mass spectrometry data", "application/x-mz5"),
# h5mlm - ML model
([], [b"model_type", b"h5mlm"], 1, ".h5mlm", "HDF5 ML model", "application/x-h5mlm"),
]
Expand Down
39 changes: 39 additions & 0 deletions puremagic/scanners/ogg_scanner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os

from puremagic.scanners.helpers import Match

# Ogg page capture pattern; main() requires it at the very start of the file.
match_bytes = b"OggS"

# Ogg codec identification signatures found at the start of the first page payload.
# Each entry: (codec_id_bytes, extension, name, mime_type)
# main() tests the entries in list order with startswith() and returns the first hit.
_OGG_CODEC_MAP = [
    (b"\x01vorbis", ".ogg", "Ogg Vorbis Audio", "audio/ogg"),
    (b"OpusHead", ".opus", "Ogg Opus Audio", "audio/ogg"),
    (b"\x80theora", ".ogv", "Ogg Theora Video", "video/ogg"),
    (b"\x7fFLAC", ".oga", "Ogg FLAC Audio", "audio/ogg"),
    (b"Speex ", ".spx", "Ogg Speex Audio", "audio/ogg"),
    (b"fishead\x00", ".ogv", "Ogg Annodex", "video/ogg"),
    (b"\x01video", ".ogm", "OGM Video", "video/x-ogm+ogg"),
]


def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
    """Identify the codec carried by an Ogg stream from its first page.

    Args:
        file_path: Path of the file being scanned (unused).
        head: Leading bytes of the file.
        foot: Trailing bytes of the file (unused).

    Returns:
        A match with confidence 0.9 for the first ``_OGG_CODEC_MAP`` signature
        found at the start of the first page payload. ``None`` when ``head`` is
        shorter than 28 bytes, is not a version-0 beginning-of-stream Ogg page,
        has no payload bytes available, or matches no known signature.
    """
    if not head or len(head) < 28:
        return None

    # Verify OggS capture pattern, version 0, and beginning-of-stream flag.
    is_first_page = head[:4] == match_bytes and head[4] == 0 and bool(head[5] & 0x02)
    if not is_first_page:
        return None

    # Byte 26 holds the segment count; the payload follows the segment table.
    body_offset = 27 + head[26]
    if body_offset >= len(head):
        return None

    hit = next(
        (entry for entry in _OGG_CODEC_MAP if head.startswith(entry[0], body_offset)),
        None,
    )
    if hit is None:
        return None

    _, extension, label, mime = hit
    return Match(extension, label, mime, confidence=0.9)
4 changes: 4 additions & 0 deletions puremagic/scanners/text_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@ def main(file_path: os.PathLike | str, _, __) -> Match | None:
if len(head) < 8:
return Match("", "very short file", "application/octet-stream", confidence=0.5)

# NUL bytes indicate binary data, but skip this check for UTF-16 (which has NUL bytes naturally)
if b"\x00" in head and head[:2] not in (b"\xff\xfe", b"\xfe\xff"):
return Match("", "data", "application/octet-stream", confidence=0.5)

try:
text, encoding = decode_any(head)
except TypeError:
Expand Down
Loading
Loading