Write

cozip::create takes an arrow::Table with name and path columns. Any additional columns are carried along as user-defined manifest columns.

# Scratch directory that holds the example payload files.
tmp <- tempfile()
dir.create(tmp)

# Build all three paths in one vectorized call rather than growing a vector
# inside the loop.
paths <- file.path(tmp, sprintf("file_%04d.bin", 0:2))

# Every file gets the same content: the 12-byte line repeated 1000 times.
payload <- rep(charToRaw("hello cozip\n"), 1000)
for (p in paths) {
  writeBin(payload, p)
}

# One row per file: its base name and its full path on disk.
tbl <- arrow::arrow_table(
  name = basename(paths),
  path = paths
)

cozip::create("dataset.zip", tbl)

Read

cozip::read works on a local path or a remote URL with the same call. It returns a data.frame with an injected cozip:gdal_vsi column whose values can be passed directly to terra or sf.

# Local archive.
df <- cozip::read("dataset.zip")

# The same call accepts a remote URL; the archive is not downloaded in full
# (two range requests under the hood).
remote_url <- paste0(
  "https://huggingface.co/datasets/Major-TOM/Core-VIIRS-Nighttime-Light/",
  "resolve/main/2024/MAJORTOM-VIIRS-NTL_2024_median_000.zip"
)
df <- cozip::read(remote_url)

# Open the first entry with terra through its cozip:gdal_vsi path.
first_vsi <- df[["cozip:gdal_vsi"]][1]
r <- terra::rast(first_vsi)

Query with DuckDB

The cozip community extension reads the same archive over SQL. Pair it with the duckdb R package, or run it straight from the DuckDB CLI.

-- install once from the community repository, then load the extension
INSTALL cozip FROM community;
LOAD cozip;

-- hello world: the first 10 manifest entries
SELECT *
FROM read_cozip(
    'https://huggingface.co/datasets/Major-TOM/Core-VIIRS-Nighttime-Light/resolve/main/2024/MAJORTOM-VIIRS-NTL_2024_median_000.zip'
)
LIMIT 10;

-- raw manifest: gdal_vsi := false turns off the injected /vsisubfile/ column
SELECT *
FROM read_cozip(
    'https://huggingface.co/datasets/Major-TOM/Core-VIIRS-Nighttime-Light/resolve/main/2024/MAJORTOM-VIIRS-NTL_2024_median_000.zip',
    gdal_vsi := false
)
LIMIT 10;

-- the five largest .tif entries, keeping their /vsisubfile/ paths
-- (the "cozip:gdal_vsi" column)
SELECT
    name,
    "cozip:gdal_vsi",
    size
FROM read_cozip(
    'https://huggingface.co/datasets/Major-TOM/Core-VIIRS-Nighttime-Light/resolve/main/2024/MAJORTOM-VIIRS-NTL_2024_median_000.zip'
)
WHERE name LIKE '%.tif'
ORDER BY size DESC
LIMIT 5;