import pystac_client
import planetary_computer
import duckdb
import adlfs
Accessing GBIF Parquet data on Azure and AWS clouds using duckdb
Azure
Using Planetary Computer, we can query STAC to access GBIF snapshots on Azure
= pystac_client.Client.open(
catalog "https://planetarycomputer.microsoft.com/api/stac/v1",
=planetary_computer.sign_inplace,
modifier
)= catalog.search(collections=["gbif"])
search = search.get_all_items()
items = {x.id: x for x in items}
items
# most recent snapshot (first item)
= list(items.values())[0] item
= item.assets["data"].extra_fields["table:storage_options"]
keys = keys["account_name"]
AZURE_STORAGE_ACCOUNT_NAME = keys["credential"] AZURE_STORAGE_ACCOUNT_KEY
= adlfs.AzureBlobFileSystem(account_name=AZURE_STORAGE_ACCOUNT_NAME, account_key=AZURE_STORAGE_ACCOUNT_KEY )
fs = duckdb.connect()
con con.register_filesystem(fs)
# test with trivial query
= con.execute(f'''
df SELECT *
FROM read_parquet("abfs://gbif/occurrence/2023-02-01/occurrence.parquet/*")
LIMIT 1
'''
).df()
df
Count vertebrate species by class in each .1 degree pixel
= f'''
query SELECT class, longitude, latitude, COALESCE(n, 0.0) AS n
FROM (
SELECT class, longitude, latitude, COUNT(*) AS n
FROM (
SELECT DISTINCT
class,
genus,
ROUND(decimallongitude,1) AS longitude,
ROUND(decimallatitude, 1) AS latitude
FROM read_parquet("abfs://gbif/occurrence/2023-02-01/occurrence.parquet/*")
WHERE (phylum = 'Chordata')
)
GROUP BY class, longitude, latitude
)
'''
= con.execute(query).df() df
NameError: name 'con' is not defined
import geopandas as gpd
from geocube.api.core import make_geocube
from geocube.rasterize import rasterize_points_griddata, rasterize_points_radial
= gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs='epsg:4326')
gdf = make_geocube(
geo_grid =gdf,
vector_data=['n'],
measurements=.1,
resolution=rasterize_points_griddata,
rasterize_function )
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
=dict(projection=ccrs.Robinson(),
subplot_kws='grey')
facecolor
=[12,8])
plt.figure(figsize= geo_grid.n.plot(cmap='viridis',
p =subplot_kws,
subplot_kws=ccrs.PlateCarree(),
transform=False,
add_labels=False)
add_colorbar
='grey', alpha=0.5, linestyle='--')
p.axes.gridlines(color-19, 35, 30, 66], crs=ccrs.PlateCarree()) p.axes.set_extent([