Geoparquet in R

Accessing an example GeoParquet file in an S3 bucket and reading in only the data that falls within a bounding box and meets certain attribute criteria
geoparquet
R
arrow
Author

Marc Weber

Published

December 19, 2025

GeoParquet extends the Parquet file format and, as nicely described in this blog post by Kyle Barron, provides a powerful new way to store and share geospatial data in a cloud-optimized format. I’ve been using it for more and more of my spatial data, and below is a quick example that uses Overture Maps buildings data and applies spatial and attribute filtering to subset the data before reading it in.

First open a connection to a cloud-hosted GeoParquet file

For this example we use the Overture Maps buildings data (a public S3 bucket). We’ll open a connection to the dataset, but we are not actually reading any of it in yet.

library(arrow)
library(dplyr)
library(sf)
library(sfarrow)

# Anonymous handle on the public Overture Maps bucket (region us-west-2)
bucket <- s3_bucket(
  "overturemaps-us-west-2",
  anonymous = TRUE,
  region = "us-west-2"
)
# Point at the buildings release and open it as a dataset; the table itself
# is not pulled into memory at this stage
ds_path <- bucket$path("release/2025-12-17.0/theme=buildings/type=building")
buildings_ds <- open_dataset(ds_path, format = "parquet")
# List the column names; `bbox` and `geometry` are the spatial columns
print(buildings_ds$schema$names)
 [1] "id"                     "geometry"               "bbox"                  
 [4] "version"                "sources"                "level"                 
 [7] "subtype"                "class"                  "height"                
[10] "names"                  "has_parts"              "is_underground"        
[13] "num_floors"             "num_floors_underground" "min_height"            
[16] "min_floor"              "facade_color"           "facade_material"       
[19] "roof_material"          "roof_shape"             "roof_direction"        
[22] "roof_orientation"       "roof_color"             "roof_height"           
# Look up the `bbox` field in the dataset schema by name
bbox_field <- buildings_ds$schema$GetFieldByName("bbox")

Look at a slice to see structure of data

# Peek at the first five ids together with their bbox struct
buildings_ds %>%
  select(id, bbox) %>%
  slice_head(n = 5) %>%
  collect()
# A tibble: 5 × 2
  id                                   bbox$xmin $xmax $ymin $ymax
  <chr>                                    <dbl> <dbl> <dbl> <dbl>
1 3444d8b4-e0e0-48ec-8e08-d31df0ea8585     -62.1 -62.1 -30.6 -30.6
2 62bc8afc-4764-4069-bc92-16489dddfaa6     -62.1 -62.1 -30.6 -30.6
3 337adaa6-6386-478e-a526-c080bc24ca75     -62.1 -62.1 -30.6 -30.6
4 6800a4d8-4143-433c-8765-19938b8be1d7     -62.1 -62.1 -30.6 -30.6
5 a0b1ebd3-0b48-474e-9025-69e18c6a22c3     -62.1 -62.1 -30.6 -30.6

Inspect the bbox field and its type

# Fetch the schema entry for `bbox` and print the field definition
# (its name together with its type)
bbox_field <- buildings_ds$schema$GetFieldByName("bbox")
print(bbox_field)
Field
bbox: struct<xmin: float, xmax: float, ymin: float, ymax: float>
# Show only the Arrow data type of the field: a struct with four float members
print(bbox_field$type) 
StructType
struct<xmin: float, xmax: float, ymin: float, ymax: float>

Prune the data

We define our query bounding box in EPSG:4326 (lon/lat), then filter on the bbox struct and on building attributes before any data is read in.

# Query window in EPSG:4326 (longitude/latitude)
xmin <- -123.32
ymin <- 44.52
xmax <- -123.20
ymax <- 44.63

# Build the pruned query. This only describes the filter and the columns to
# keep; the rows are fetched later, when the query is collected.
ds_pruned <- buildings_ds %>%
  filter(
    # Bounding-box overlap test: a feature is kept when its bbox struct
    # overlaps the query window on both axes
    bbox$xmin <= xmax,      # feature xmin <= query xmax
    bbox$xmax >= xmin,      # feature xmax >= query xmin
    bbox$ymin <= ymax,      # feature ymin <= query ymax
    bbox$ymax >= ymin,      # feature ymax >= query ymin
    # Attribute criteria; rows with missing values are excluded explicitly
    !is.na(height), height >= 4,
    !is.na(num_floors), num_floors >= 2
  ) %>%
  # Keep the id, the geometry and a few scalar attributes; nested columns
  # such as 'names' and 'sources' are left out
  select(id, geometry, height, num_floors, subtype, class)

Materialize and convert to sf

# Run the pruned query; this is the point where rows are pulled into memory
tbl <- ds_pruned %>%
  collect()

# Decode the geometry column into an sfc (EPSG:4326) and attach it to the
# remaining attribute columns as the active geometry
geom_sfc <- st_as_sfc(tbl$geometry, crs = 4326)
buildings_sf <- st_sf(
  select(tbl, -geometry),
  geometry = geom_sfc
)

# Optional second pass in sf: the Arrow filter compared feature bboxes only,
# so test the actual geometries against the query rectangle. st_filter keeps
# whole features that intersect the rectangle; it does not cut geometries.
bbox_sfc <- c(xmin = xmin, ymin = ymin, xmax = xmax, ymax = ymax) %>%
  st_bbox(crs = 4326) %>%
  st_as_sfc()
buildings_bbox <- st_filter(buildings_sf, bbox_sfc)

# Interactive map of the selected buildings
mapview::mapview(buildings_bbox)