initial commit

Piotr Oleszczyk 2025-05-10 17:41:31 +02:00
commit c6d1d51b00
11 changed files with 1214 additions and 0 deletions

246 src/ptscrapper/storage.py Normal file

@@ -0,0 +1,246 @@
# src/ptscrapper/storage.py
import glob
import os
from datetime import date, datetime

import geoarrow.pyarrow as ga
import polyline
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from rclone_python import rclone

from .config import (
    HOST_ID,  # unique identifier for this host/container
    LOCAL_DATA_DIR,  # base local folder, e.g. "/data/pt-scraper"
    RCLONE_REMOTE,  # name of your rclone remote, e.g. "nas"
    RCLONE_REMOTE_PATH,  # base path on the remote, e.g. "pt-scraper-data"
)


def _ensure_dir(path: str):
    """Make sure the directory exists."""
    os.makedirs(path, exist_ok=True)


def write_positions(table: pa.Table, ts: datetime) -> str:
    """
    Convert x/y to a Point geometry, drop the originals,
    and write out a GeoParquet.
    """
    # 1) Extract x/y as NumPy arrays (to_numpy on a ChunkedArray yields a copy)
    xs = table.column("x").to_numpy()
    ys = table.column("y").to_numpy()
    # 2) Build a point type with the correct CRS
    builder = ga.point().with_crs("EPSG:4326")
    # 3) Create the geometry array directly from the buffers,
    #    then ensure the geometry is in WKB format
    geom = ga.as_wkb(builder.from_geobuffers(None, xs, ys))
    # 4) Drop the old coords and append 'geometry'
    table = table.drop(["x", "y"]).append_column("geometry", geom)
    subdir = ts.strftime("positions/%Y/%m/%d/%H")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)
    filename = f"{ts.strftime('%M%S')}_{HOST_ID}.parquet"
    local_path = os.path.join(local_dir, filename)
    pq.write_table(table, local_path, compression="snappy")
    return local_path
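

# A minimal usage sketch. The "vehicle_id" payload column and the sample
# coordinates are illustrative assumptions; the function itself only requires
# float64 "x"/"y" (lon/lat) columns:
#
#   import pyarrow as pa
#   from datetime import datetime, timezone
#
#   tbl = pa.table({
#       "vehicle_id": [101, 102],
#       "x": [19.945, 19.960],  # longitude
#       "y": [50.065, 50.070],  # latitude
#   })
#   write_positions(tbl, datetime.now(timezone.utc))
#   # -> "<LOCAL_DATA_DIR>/positions/YYYY/MM/DD/HH/MMSS_<HOST_ID>.parquet"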


def write_course_posts(table: pa.Table, course_id: int, ts: datetime) -> str:
    """
    Write course-posts data for a single course to Parquet.
    """
    subdir = ts.strftime("courses/%Y-%m-%d")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)
    filename = f"course_{course_id}_{ts.strftime('%H%M%S')}_{HOST_ID}.parquet"
    local_path = os.path.join(local_dir, filename)
    pq.write_table(table, local_path, compression="snappy")
    return local_path


def write_stops(table: pa.Table, snapshot_dt: datetime) -> str:
    """
    Write the daily snapshot of stops (getPosts).
    """
    subdir = snapshot_dt.strftime("stops/%Y-%m-%d")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)
    local_path = os.path.join(local_dir, f"stops_{HOST_ID}.parquet")
    pq.write_table(table, local_path, compression="snappy")
    return local_path


def write_posts_lines(table: pa.Table, snapshot_dt: datetime) -> str:
    """
    Write the daily stop-to-lines mapping (getPostsLines).
    """
    subdir = snapshot_dt.strftime("stops_lines/%Y-%m-%d")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)
    local_path = os.path.join(local_dir, f"posts_lines_{HOST_ID}.parquet")
    pq.write_table(table, local_path, compression="snappy")
    return local_path
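

# For reference, the on-disk layout the writers above produce under
# LOCAL_DATA_DIR (filenames are illustrative; they depend on the timestamp,
# course id, and HOST_ID):
#
#   positions/2025/05/10/17/4131_<HOST_ID>.parquet
#   courses/2025-05-10/course_1234_174131_<HOST_ID>.parquet
#   stops/2025-05-10/stops_<HOST_ID>.parquet
#   stops_lines/2025-05-10/posts_lines_<HOST_ID>.parquet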


def push_to_nas(local_path: str):
    """
    Push a local file to the configured rclone remote,
    preserving its relative path under RCLONE_REMOTE_PATH.
    """
    # Compute the file's directory on the remote. rclone's `copy` treats the
    # destination as a directory, so point it at the parent dir rather than
    # at the full file path.
    rel_path = os.path.relpath(local_path, LOCAL_DATA_DIR).replace(os.sep, "/")
    remote_dir = f"{RCLONE_REMOTE}:{RCLONE_REMOTE_PATH}/{os.path.dirname(rel_path)}"
    # Copy, skipping files that already exist remotely
    rclone.copy(local_path, remote_dir, ignore_existing=True)
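

# Example of the resulting mapping, assuming LOCAL_DATA_DIR="/data/pt-scraper",
# RCLONE_REMOTE="nas", and RCLONE_REMOTE_PATH="pt-scraper-data" (the sample
# values from the config comments above):
#
#   /data/pt-scraper/stops/2025-05-10/stops_<HOST_ID>.parquet
#   -> nas:pt-scraper-data/stops/2025-05-10/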


def compact_positions(date_to_compact: date) -> str:
    """
    Read all the small Parquet files under positions/YYYY/MM/DD,
    merge them into one table, and write a single daily file.
    Returns the path to that daily Parquet.
    """
    # Build a glob pattern for that day and expand it ourselves:
    # ds.dataset() takes paths, not glob patterns
    day_glob = os.path.join(
        LOCAL_DATA_DIR, f"positions/{date_to_compact:%Y/%m/%d}/**/*.parquet"
    )
    files = glob.glob(day_glob, recursive=True)
    # Load all shards as one dataset
    dataset = ds.dataset(files, format="parquet")
    table = dataset.to_table()  # merges all batches
    # Write one daily file
    out_dir = os.path.join(LOCAL_DATA_DIR, "positions_compacted")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{date_to_compact.isoformat()}_{HOST_ID}.parquet")
    pq.write_table(table, out_path, compression="snappy")
    return out_path


def cleanup_small_positions(date_to_remove: date):
    """
    Optionally delete the raw minute files once compacted.
    """
    base = os.path.join(LOCAL_DATA_DIR, f"positions/{date_to_remove:%Y/%m/%d}")
    for path in glob.glob(f"{base}/**/*.parquet", recursive=True):
        try:
            os.remove(path)
        except OSError:
            pass
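

# Typical sequence once a day has finished (the compact -> push -> cleanup
# ordering is an assumption about the intended workflow; a fuller sketch is
# in the __main__ block at the bottom of this file):
#
#   daily = compact_positions(finished_day)
#   push_to_nas(daily)
#   cleanup_small_positions(finished_day)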


def compact_course_posts(date_to_compact: date) -> str:
    """
    Read all shards under courses/YYYY-MM-DD/*.parquet,
    merge them into one table, and write a single daily file.
    """
    pattern = os.path.join(
        LOCAL_DATA_DIR, f"courses/{date_to_compact:%Y-%m-%d}/*.parquet"
    )
    # Expand the pattern ourselves; ds.dataset() does not glob
    dataset = ds.dataset(glob.glob(pattern), format="parquet")
    table = dataset.to_table()
    out_dir = os.path.join(LOCAL_DATA_DIR, "courses_compacted")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{date_to_compact.isoformat()}_{HOST_ID}.parquet")
    pq.write_table(table, out_path, compression="snappy")
    return out_path


def cleanup_small_course_posts(date_to_remove: date):
    """
    Delete the original course shards once compacted.
    """
    base = os.path.join(LOCAL_DATA_DIR, f"courses/{date_to_remove:%Y-%m-%d}")
    for fn in glob.glob(f"{base}/*.parquet"):
        try:
            os.remove(fn)
        except OSError:
            pass


def write_course_geometry(polyline_str: str, course_id: int, ts: datetime) -> str:
    """
    Decode the Google polyline into a GeoArrow LineString WKB,
    attach EPSG:4326, and write out a GeoParquet.
    """
    # 1) Decode → list of (lon, lat)
    coords = polyline.decode(polyline_str, geojson=True)
    # 2) Build a WKT LINESTRING
    pts = ", ".join(f"{x} {y}" for x, y in coords)
    wkt = f"LINESTRING({pts})"
    # 3) Convert that WKT to a WKB ExtensionArray:
    #    as_wkb() will parse each WKT string into WKB
    wkb_arr = ga.as_wkb([wkt])
    # 4) Cast into a WkbType with EPSG:4326 built in
    wkb_with_crs = wkb_arr.cast(ga.wkb().with_crs("EPSG:4326"))
    # 5) Build a 1-row table
    table = pa.Table.from_arrays(
        [
            pa.array([ts]),
            pa.array([course_id]),
            wkb_with_crs,
        ],
        names=["fetch_date", "course_id", "geometry"],
    )
    # 6) Write out the GeoParquet
    subdir = ts.strftime("courses_geometry/%Y-%m-%d")
    out_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    os.makedirs(out_dir, exist_ok=True)
    fn = f"shape_{course_id}_{ts.strftime('%H%M%S')}.parquet"
    out_path = os.path.join(out_dir, fn)
    pq.write_table(table, out_path, compression="snappy")
    return out_path
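

# Usage sketch (the encoded shape below is the classic example polyline, not
# real course data, and course_id 1234 is made up):
#
#   from datetime import datetime, timezone
#
#   shape = "_p~iF~ps|U_ulLnnqC_mqNvxq`@"
#   write_course_geometry(shape, course_id=1234, ts=datetime.now(timezone.utc))
#   # -> "<LOCAL_DATA_DIR>/courses_geometry/YYYY-MM-DD/shape_1234_HHMMSS.parquet"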


def compact_course_geometry(date_to_compact: date) -> str:
    """
    Read all geometry shards under courses_geometry/YYYY-MM-DD/*.parquet,
    merge them into one table, and write a single daily file.
    Returns the path to that daily Parquet.
    """
    pattern = os.path.join(
        LOCAL_DATA_DIR, f"courses_geometry/{date_to_compact:%Y-%m-%d}/*.parquet"
    )
    # Expand the pattern ourselves; ds.dataset() does not glob
    dataset = ds.dataset(glob.glob(pattern), format="parquet")
    table = dataset.to_table()
    out_dir = os.path.join(LOCAL_DATA_DIR, "courses_geometry_compacted")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{date_to_compact.isoformat()}_{HOST_ID}.parquet")
    pq.write_table(table, out_path, compression="snappy")
    return out_path


def cleanup_small_course_geometry(date_to_remove: date):
    """
    Delete the original course-geometry shards once compacted.
    """
    base = os.path.join(LOCAL_DATA_DIR, f"courses_geometry/{date_to_remove:%Y-%m-%d}")
    for fn in glob.glob(f"{base}/*.parquet"):
        try:
            os.remove(fn)
        except OSError:
            pass
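

if __name__ == "__main__":
    # Hedged smoke test: run a full compaction pass for yesterday by hand.
    # This block is illustrative, not required by the module; it assumes
    # shards for that day actually exist locally, and the push-before-cleanup
    # ordering is an assumption about the intended workflow.
    from datetime import timedelta

    yesterday = date.today() - timedelta(days=1)
    for compact, cleanup in [
        (compact_positions, cleanup_small_positions),
        (compact_course_posts, cleanup_small_course_posts),
        (compact_course_geometry, cleanup_small_course_geometry),
    ]:
        daily_path = compact(yesterday)
        push_to_nas(daily_path)
        cleanup(yesterday)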