correct daily snapshot: flatten stops, add fetch date

This commit is contained in:
Piotr Oleszczyk 2025-05-12 14:46:33 +02:00
parent 6891e3b206
commit 96f918445c
2 changed files with 47 additions and 5 deletions

View file

@ -78,21 +78,47 @@ def write_course_posts(table: pa.Table, course_id: int, ts: datetime) -> str:
def write_stops(table: pa.Table, snapshot_dt: datetime) -> str:
    """
    Write the daily snapshot of stops (getPosts) as a Parquet file.

    Takes a flat table with columns
        fetch_date, group_name, group_id, group_type,
        stop_id, stop_type, x, y
    converts x/y to a WKB Point geometry column tagged with CRS EPSG:4326,
    drops the raw coordinate columns, and writes the result with snappy
    compression.

    Args:
        table: flat stops table; must contain the ``x`` and ``y`` columns.
        snapshot_dt: snapshot timestamp; only its date part is used, to
            select the ``stops/YYYY-MM-DD`` output subdirectory.

    Returns:
        Path of the written file:
        ``<LOCAL_DATA_DIR>/stops/<YYYY-MM-DD>/stops_<HOST_ID>.parquet``.
    """
    # 1) pull x,y into numpy
    xs = table.column("x").to_numpy()
    ys = table.column("y").to_numpy()

    # 2) build the point type with CRS
    # NOTE(review): EPSG:4326 presumably means x=longitude, y=latitude --
    # confirm against the upstream API.
    point_type = ga.point().with_crs("EPSG:4326")

    # 3) build WKB geometry (no validity buffer: every row is treated as valid)
    geom = ga.as_wkb(point_type.from_geobuffers(None, xs, ys))

    # 4) drop coords & append geometry column
    table = table.drop(["x", "y"]).append_column("geometry", geom)

    # 5) write out exactly once; the file name is keyed by HOST_ID
    subdir = snapshot_dt.strftime("stops/%Y-%m-%d")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)

    filename = f"stops_{HOST_ID}.parquet"
    out_path = os.path.join(local_dir, filename)
    pq.write_table(table, out_path, compression="snappy")
    return out_path
def write_posts_lines(table: pa.Table, snapshot_dt: datetime) -> str:
"""
Write the daily stoplines mapping (getPostsLines).
"""
# ensure every row gets a fetch_date
if "fetch_date" not in table.column_names:
table = table.append_column(
"fetch_date",
pa.array([snapshot_dt] * table.num_rows, type=pa.timestamp("ns", tz="UTC")),
)
subdir = snapshot_dt.strftime("stops_lines/%Y-%m-%d")
local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
_ensure_dir(local_dir)