cozip · Python

Write

Drop three files in a temp directory, build a pa.Table with name, path, and any extra columns you want to ride along (here split, label, and a GeoParquet geometry column), then hand it to cozip.write.

import json
import tempfile
from pathlib import Path

import pyarrow as pa
import shapely
from shapely.geometry import box

import cozip

# scratch directory with three throwaway payload files (contents are arbitrary)
tmp = Path(tempfile.mkdtemp())
paths = [tmp / f"file_{idx}.bin" for idx in range(3)]
for idx, target in enumerate(paths):
    target.write_bytes(f"file {idx} contents\n".encode() * 100)

# GeoParquet metadata, stored under the "geo" schema key below,
# so viewers recognize the geometry column
geometry_spec = {"encoding": "WKB", "geometry_types": ["Polygon"]}
geo = {
    "version": "1.1.0",
    "primary_column": "geometry",
    "columns": {"geometry": geometry_spec},
}

# one (minx, miny, maxx, maxy) footprint per file, encoded as WKB polygons
bounds = [
    (-77.0, -12.1, -76.9, -12.0),
    (-76.9, -12.1, -76.8, -12.0),
    (-76.8, -12.1, -76.7, -12.0),
]
footprints = [shapely.to_wkb(box(*extent)) for extent in bounds]

# name and path describe the files; the remaining columns are user extras
columns = {
    "name": [entry.name for entry in paths],
    "path": [str(entry) for entry in paths],
    "split": ["train", "val", "train"],
    "label": ["zeros", "ones", "twos"],
    "geometry": footprints,
}
table = pa.table(columns).replace_schema_metadata({"geo": json.dumps(geo)})

# write the archive next to the payload files
archive = str(tmp.joinpath("dataset.zip"))
cozip.write(archive, table)

Read

cozip.read returns the manifest as a pandas DataFrame, your custom columns included. Filter however you like, then read the matching files in place: seek to each row's offset and read size bytes, using the two columns the writer added.

import cozip

# load the manifest and show it
df = cozip.read(archive)
print(df)

# select rows by a custom column, then pull each matching file's bytes
# straight out of the archive using its offset and size
is_train = df["split"] == "train"
trains = df[is_train]
with open(archive, "rb") as f:
    for name, offset, size in zip(trains["name"], trains["offset"], trains["size"]):
        f.seek(int(offset))
        data = f.read(int(size))
        print(name, len(data), "bytes")

Publish

The archive is a plain ZIP, so any S3-compatible bucket works. The example below uploads to asterisk-labs/cozip on Source Cooperative.

import boto3
import cozip

# Source Cooperative is a regular S3 bucket. Swap bucket and creds for HuggingFace, R2, MinIO, etc.
credentials = {
    "aws_access_key_id": "<your-key>",
    "aws_secret_access_key": "<your-secret>",
    "aws_session_token": "<your-session-token>",  # Source Coop creds are temporary STS
}
s3 = boto3.client("s3", region_name="us-west-2", **credentials)

# upload the local archive to the bucket under the repository prefix
s3.upload_file(
    Filename=archive,
    Bucket="us-west-2.opendata.source.coop",
    Key="asterisk-labs/cozip/dataset.zip",
)

# same read flow, just a URL instead of a path
url = "https://data.source.coop/asterisk-labs/cozip/dataset.zip"
manifest = cozip.read(url)
print(manifest)

Explore in the playground

Once the archive is public, the cozip playground reads it straight from the browser. No Python, no install, no download. The manifest renders as a table, and a copyable URL for any file is one click away.

Open the playground and paste the URL below into the input.

https://data.source.coop/asterisk-labs/cozip/dataset.zip