cozip · R

Write

Drop three files in a temp directory, build an arrow::arrow_table with name, path, and any extra columns you want to ride along (here split, label, and a GeoParquet geometry column), then hand it to cozip::write.

library(arrow)
library(jsonlite)
library(sf)
library(cozip)

# scratch directory with three small placeholder files (file_0.bin .. file_2.bin)
tmp <- tempfile()
dir.create(tmp)
paths <- file.path(tmp, sprintf("file_%d.bin", 0:2))
invisible(Map(
  function(p, i) {
    # 100 back-to-back copies of one short text line, written as raw bytes
    writeBin(rep(charToRaw(sprintf("file %d contents\n", i)), times = 100), p)
  },
  paths, 0:2
))

# GeoParquet descriptor: names `geometry` as the primary column and declares it
# as WKB-encoded polygons, so GeoParquet-aware viewers recognize it
geo <- list(
  version = "1.1.0",
  primary_column = "geometry",
  columns = list(
    geometry = list(
      encoding = "WKB",
      geometry_types = list("Polygon")
    )
  )
)

# Encode the axis-aligned rectangle [xmin, xmax] x [ymin, ymax] as a WKB polygon.
# Returns the WKB bytes for that single geometry.
bbox_wkb <- function(xmin, ymin, xmax, ymax) {
  # closed ring: starts at the (xmin, ymin) corner and ends on it again
  ring <- matrix(
    c(
      xmin, ymin,
      xmax, ymin,
      xmax, ymax,
      xmin, ymax,
      xmin, ymin
    ),
    ncol = 2,
    byrow = TRUE
  )
  wkb <- st_as_binary(st_sfc(st_polygon(list(ring))))
  # the sfc holds exactly one geometry, so keep its only entry
  wkb[[1]]
}

# one manifest row per file: name and path first, then the extra columns
tbl <- arrow_table(
  name = basename(paths),
  path = paths,
  split = c("train", "val", "train"),
  label = c("zeros", "ones", "twos"),
  # three side-by-side boxes, one WKB value per row
  geometry = Map(
    bbox_wkb,
    c(-77.0, -76.9, -76.8), # xmin
    -12.1,                  # ymin
    c(-76.9, -76.8, -76.7), # xmax
    -12.0                   # ymax
  )
)
# attach the GeoParquet descriptor as JSON under the "geo" metadata key
tbl$metadata <- list(
  geo = toJSON(geo, auto_unbox = TRUE)
)

archive <- file.path(tmp, "dataset.zip")
cozip::write(archive, tbl)

Read

cozip::read returns the manifest as a tibble, your custom columns included. Filter however you like, then open the matching files in place with seek and readBin using the offset and size the writer added.

library(cozip)

# the manifest comes back as a tibble: one row per archived file, custom
# columns included, plus the offset and size the writer added
df <- cozip::read(archive)
print(df)

# filter on your own columns; which() drops rows where split is NA, which a
# bare logical index would turn into all-NA rows
trains <- df[which(df$split == "train"), ]

# read the matching files in place; `finally` closes the connection even when
# a seek or read fails part-way through
con <- file(archive, "rb")
tryCatch(
  {
    for (i in seq_len(nrow(trains))) {
      seek(con, where = trains$offset[i])
      data <- readBin(con, what = "raw", n = trains$size[i])
      cat(trains$name[i], length(data), "bytes\n")
    }
  },
  finally = close(con)
)

Publish

The archive is a plain ZIP so any S3-compatible bucket works. The example below uses asterisk-labs/cozip on Source Cooperative.

library(aws.s3)
library(cozip)

# Source Cooperative is a regular S3 bucket. For HuggingFace, R2, MinIO, etc.
# swap in that service's bucket and credentials.
Sys.setenv(
  AWS_DEFAULT_REGION = "us-west-2",
  AWS_ACCESS_KEY_ID = "<your-key>",
  AWS_SECRET_ACCESS_KEY = "<your-secret>",
  # Source Coop creds are temporary STS, hence the session token
  AWS_SESSION_TOKEN = "<your-session-token>"
)

put_object(
  bucket = "us-west-2.opendata.source.coop",
  object = "asterisk-labs/cozip/dataset.zip",
  file = archive
)

# reading is unchanged: hand cozip::read the public URL instead of a local path
url <- "https://data.source.coop/asterisk-labs/cozip/dataset.zip"
print(cozip::read(url))

Explore in the playground

Once the archive is public the cozip playground reads it from the browser. No R, no install, no download. The manifest renders as a table, and any file is one click from a copyable URL.

Open the playground and paste the URL below into the input.

https://data.source.coop/asterisk-labs/cozip/dataset.zip