Write

cozip.create takes an Arrow table with name and path columns. Any additional columns are carried along as user-defined manifest columns.

import os
import tempfile
import pyarrow as pa
import cozip

# Scratch directory that holds the sample input files.
tmp = tempfile.mkdtemp()

# Decide on the three file locations first, then fill each file with the
# same small binary payload.
paths = [os.path.join(tmp, f"file_{i:04d}.bin") for i in range(3)]
for p in paths:
    with open(p, "wb") as f:
        f.write(b"hello cozip\n" * 1000)

# Arrow table with the two columns cozip.create expects:
# "name" (here the file's basename) and "path" (the file on disk).
names = [os.path.basename(p) for p in paths]
table = pa.table({"name": names, "path": paths})

cozip.create("dataset.zip", table)

Read

cozip.read works on a local path or a remote URL with the same call. It returns a pandas DataFrame that includes an injected cozip:gdal_vsi column, ready for rasterio or GDAL.

import cozip

# local archive: pass the file path
df = cozip.read("dataset.zip")

# remote archive: same call with a URL
# (no full download, two range requests under the hood)
remote_url = (
    "https://huggingface.co/datasets/Major-TOM/Core-VIIRS-Nighttime-Light/"
    "resolve/main/2024/MAJORTOM-VIIRS-NTL_2024_median_000.zip"
)
df = cozip.read(remote_url)

# take the first row's cozip:gdal_vsi path and open it with rasterio
import rasterio

first_vsi = df["cozip:gdal_vsi"].iloc[0]
with rasterio.open(first_vsi) as src:
    arr = src.read(1)

Query with DuckDB

The cozip community extension reads the same archive over SQL, on native and WebAssembly. One install, then any cozip URL works as a table.

-- setup: install the community extension once, then load it
INSTALL cozip FROM community;
LOAD cozip;

-- hello world: the first 10 entries of the manifest
SELECT *
FROM read_cozip(
    'https://huggingface.co/datasets/Major-TOM/Core-VIIRS-Nighttime-Light/resolve/main/2024/MAJORTOM-VIIRS-NTL_2024_median_000.zip'
)
LIMIT 10;

-- raw manifest: gdal_vsi := false leaves out the injected
-- cozip:gdal_vsi (/vsisubfile/) column
SELECT *
FROM read_cozip(
    'https://huggingface.co/datasets/Major-TOM/Core-VIIRS-Nighttime-Light/resolve/main/2024/MAJORTOM-VIIRS-NTL_2024_median_000.zip',
    gdal_vsi := false
)
LIMIT 10;

-- filtered manifest: the five largest .tif entries,
-- keeping their /vsisubfile/ paths
SELECT
    name,
    "cozip:gdal_vsi",
    size
FROM read_cozip(
    'https://huggingface.co/datasets/Major-TOM/Core-VIIRS-Nighttime-Light/resolve/main/2024/MAJORTOM-VIIRS-NTL_2024_median_000.zip'
)
WHERE name LIKE '%.tif'
ORDER BY size DESC
LIMIT 5;