initial commit

Piotr Oleszczyk 2025-05-10 17:41:31 +02:00
commit c6d1d51b00
11 changed files with 1214 additions and 0 deletions

246 src/ptscrapper/storage.py Normal file

@@ -0,0 +1,246 @@
# src/ptscrapper/storage.py
import glob
import os
from datetime import date, datetime

import geoarrow.pyarrow as ga
import polyline
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from rclone_python import rclone

from .config import (
    HOST_ID,  # unique identifier for this host/container
    LOCAL_DATA_DIR,  # base local folder, e.g. "/data/pt-scraper"
    RCLONE_REMOTE,  # name of your rclone remote, e.g. "nas"
    RCLONE_REMOTE_PATH,  # base path on the remote, e.g. "pt-scraper-data"
)


def _ensure_dir(path: str):
    """Make sure the directory exists."""
    os.makedirs(path, exist_ok=True)


def write_positions(table: pa.Table, ts: datetime) -> str:
    """
    Convert x/y to a Point geometry, drop the originals,
    and write out a GeoParquet.
    """
    # 1) Extract x/y as NumPy arrays (to_numpy on a ChunkedArray yields a copy)
    xs = table.column("x").to_numpy()
    ys = table.column("y").to_numpy()
    # 2) Build a point type with the correct CRS
    builder = ga.point().with_crs("EPSG:4326")
    # 3) Create the geometry array directly from the buffers,
    #    then ensure the geometry is in WKB format
    geom = ga.as_wkb(builder.from_geobuffers(None, xs, ys))
    # 4) Drop the old coords and append 'geometry'
    table = table.drop(["x", "y"]).append_column("geometry", geom)
    subdir = ts.strftime("positions/%Y/%m/%d/%H")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)
    filename = f"{ts.strftime('%M%S')}_{HOST_ID}.parquet"
    local_path = os.path.join(local_dir, filename)
    pq.write_table(table, local_path, compression="snappy")
    return local_path
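

# A minimal usage sketch. The "vehicle_id" payload column and the sample
# coordinates are illustrative assumptions; the function itself only requires
# float64 "x"/"y" (lon/lat) columns:
#
#   import pyarrow as pa
#   from datetime import datetime, timezone
#
#   tbl = pa.table({
#       "vehicle_id": [101, 102],
#       "x": [19.945, 19.960],  # longitude
#       "y": [50.065, 50.070],  # latitude
#   })
#   write_positions(tbl, datetime.now(timezone.utc))
#   # -> "<LOCAL_DATA_DIR>/positions/YYYY/MM/DD/HH/MMSS_<HOST_ID>.parquet"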


def write_course_posts(table: pa.Table, course_id: int, ts: datetime) -> str:
    """
    Write course-posts data for a single course to Parquet.
    """
    subdir = ts.strftime("courses/%Y-%m-%d")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)
    filename = f"course_{course_id}_{ts.strftime('%H%M%S')}_{HOST_ID}.parquet"
    local_path = os.path.join(local_dir, filename)
    pq.write_table(table, local_path, compression="snappy")
    return local_path


def write_stops(table: pa.Table, snapshot_dt: datetime) -> str:
    """
    Write the daily snapshot of stops (getPosts).
    """
    subdir = snapshot_dt.strftime("stops/%Y-%m-%d")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)
    local_path = os.path.join(local_dir, f"stops_{HOST_ID}.parquet")
    pq.write_table(table, local_path, compression="snappy")
    return local_path


def write_posts_lines(table: pa.Table, snapshot_dt: datetime) -> str:
    """
    Write the daily stop-to-lines mapping (getPostsLines).
    """
    subdir = snapshot_dt.strftime("stops_lines/%Y-%m-%d")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)
    local_path = os.path.join(local_dir, f"posts_lines_{HOST_ID}.parquet")
    pq.write_table(table, local_path, compression="snappy")
    return local_path
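

# For reference, the on-disk layout the writers above produce under
# LOCAL_DATA_DIR (filenames are illustrative; they depend on the timestamp,
# course id, and HOST_ID):
#
#   positions/2025/05/10/17/4131_<HOST_ID>.parquet
#   courses/2025-05-10/course_1234_174131_<HOST_ID>.parquet
#   stops/2025-05-10/stops_<HOST_ID>.parquet
#   stops_lines/2025-05-10/posts_lines_<HOST_ID>.parquet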


def push_to_nas(local_path: str):
    """
    Push a local file to the configured rclone remote,
    preserving its relative path under RCLONE_REMOTE_PATH.
    """
    # Compute the file's directory on the remote. rclone's `copy` treats the
    # destination as a directory, so point it at the parent dir rather than
    # at the full file path.
    rel_path = os.path.relpath(local_path, LOCAL_DATA_DIR).replace(os.sep, "/")
    remote_dir = f"{RCLONE_REMOTE}:{RCLONE_REMOTE_PATH}/{os.path.dirname(rel_path)}"
    # Copy, skipping files that already exist remotely
    rclone.copy(local_path, remote_dir, ignore_existing=True)
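

# Example of the resulting mapping, assuming LOCAL_DATA_DIR="/data/pt-scraper",
# RCLONE_REMOTE="nas", and RCLONE_REMOTE_PATH="pt-scraper-data" (the sample
# values from the config comments above):
#
#   /data/pt-scraper/stops/2025-05-10/stops_<HOST_ID>.parquet
#   -> nas:pt-scraper-data/stops/2025-05-10/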


def compact_positions(date_to_compact: date) -> str:
    """
    Read all the small Parquet files under positions/YYYY/MM/DD,
    merge them into one table, and write a single daily file.
    Returns the path to that daily Parquet.
    """
    # Build a glob pattern for that day and expand it ourselves:
    # ds.dataset() takes paths, not glob patterns
    day_glob = os.path.join(
        LOCAL_DATA_DIR, f"positions/{date_to_compact:%Y/%m/%d}/**/*.parquet"
    )
    files = glob.glob(day_glob, recursive=True)
    # Load all shards as one dataset
    dataset = ds.dataset(files, format="parquet")
    table = dataset.to_table()  # merges all batches
    # Write one daily file
    out_dir = os.path.join(LOCAL_DATA_DIR, "positions_compacted")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{date_to_compact.isoformat()}_{HOST_ID}.parquet")
    pq.write_table(table, out_path, compression="snappy")
    return out_path


def cleanup_small_positions(date_to_remove: date):
    """
    Optionally delete the raw minute files once compacted.
    """
    base = os.path.join(LOCAL_DATA_DIR, f"positions/{date_to_remove:%Y/%m/%d}")
    for path in glob.glob(f"{base}/**/*.parquet", recursive=True):
        try:
            os.remove(path)
        except OSError:
            pass
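

# Typical sequence once a day has finished (the compact -> push -> cleanup
# ordering is an assumption about the intended workflow; a fuller sketch is
# in the __main__ block at the bottom of this file):
#
#   daily = compact_positions(finished_day)
#   push_to_nas(daily)
#   cleanup_small_positions(finished_day)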


def compact_course_posts(date_to_compact: date) -> str:
    """
    Read all shards under courses/YYYY-MM-DD/*.parquet,
    merge them into one table, and write a single daily file.
    """
    pattern = os.path.join(
        LOCAL_DATA_DIR, f"courses/{date_to_compact:%Y-%m-%d}/*.parquet"
    )
    # Expand the pattern ourselves; ds.dataset() does not glob
    dataset = ds.dataset(glob.glob(pattern), format="parquet")
    table = dataset.to_table()
    out_dir = os.path.join(LOCAL_DATA_DIR, "courses_compacted")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{date_to_compact.isoformat()}_{HOST_ID}.parquet")
    pq.write_table(table, out_path, compression="snappy")
    return out_path


def cleanup_small_course_posts(date_to_remove: date):
    """
    Delete the original course shards once compacted.
    """
    base = os.path.join(LOCAL_DATA_DIR, f"courses/{date_to_remove:%Y-%m-%d}")
    for fn in glob.glob(f"{base}/*.parquet"):
        try:
            os.remove(fn)
        except OSError:
            pass


def write_course_geometry(polyline_str: str, course_id: int, ts: datetime) -> str:
    """
    Decode the Google polyline into a GeoArrow LineString WKB,
    attach EPSG:4326, and write out a GeoParquet.
    """
    # 1) Decode → list of (lon, lat)
    coords = polyline.decode(polyline_str, geojson=True)
    # 2) Build a WKT LINESTRING
    pts = ", ".join(f"{x} {y}" for x, y in coords)
    wkt = f"LINESTRING({pts})"
    # 3) Convert that WKT to a WKB ExtensionArray:
    #    as_wkb() will parse each WKT string into WKB
    wkb_arr = ga.as_wkb([wkt])
    # 4) Cast into a WkbType with EPSG:4326 built in
    wkb_with_crs = wkb_arr.cast(ga.wkb().with_crs("EPSG:4326"))
    # 5) Build a 1-row table
    table = pa.Table.from_arrays(
        [
            pa.array([ts]),
            pa.array([course_id]),
            wkb_with_crs,
        ],
        names=["fetch_date", "course_id", "geometry"],
    )
    # 6) Write out the GeoParquet
    subdir = ts.strftime("courses_geometry/%Y-%m-%d")
    out_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    os.makedirs(out_dir, exist_ok=True)
    fn = f"shape_{course_id}_{ts.strftime('%H%M%S')}.parquet"
    out_path = os.path.join(out_dir, fn)
    pq.write_table(table, out_path, compression="snappy")
    return out_path
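

# Usage sketch (the encoded shape below is the classic example polyline, not
# real course data, and course_id 1234 is made up):
#
#   from datetime import datetime, timezone
#
#   shape = "_p~iF~ps|U_ulLnnqC_mqNvxq`@"
#   write_course_geometry(shape, course_id=1234, ts=datetime.now(timezone.utc))
#   # -> "<LOCAL_DATA_DIR>/courses_geometry/YYYY-MM-DD/shape_1234_HHMMSS.parquet"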


def compact_course_geometry(date_to_compact: date) -> str:
    """
    Read all geometry shards under courses_geometry/YYYY-MM-DD/*.parquet,
    merge them into one table, and write a single daily file.
    Returns the path to that daily Parquet.
    """
    pattern = os.path.join(
        LOCAL_DATA_DIR, f"courses_geometry/{date_to_compact:%Y-%m-%d}/*.parquet"
    )
    # Expand the pattern ourselves; ds.dataset() does not glob
    dataset = ds.dataset(glob.glob(pattern), format="parquet")
    table = dataset.to_table()
    out_dir = os.path.join(LOCAL_DATA_DIR, "courses_geometry_compacted")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{date_to_compact.isoformat()}_{HOST_ID}.parquet")
    pq.write_table(table, out_path, compression="snappy")
    return out_path


def cleanup_small_course_geometry(date_to_remove: date):
    """
    Delete the original course-geometry shards once compacted.
    """
    base = os.path.join(LOCAL_DATA_DIR, f"courses_geometry/{date_to_remove:%Y-%m-%d}")
    for fn in glob.glob(f"{base}/*.parquet"):
        try:
            os.remove(fn)
        except OSError:
            pass
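

if __name__ == "__main__":
    # Hedged smoke test: run a full compaction pass for yesterday by hand.
    # This block is illustrative, not required by the module; it assumes
    # shards for that day actually exist locally, and the push-before-cleanup
    # ordering is an assumption about the intended workflow.
    from datetime import timedelta

    yesterday = date.today() - timedelta(days=1)
    for compact, cleanup in [
        (compact_positions, cleanup_small_positions),
        (compact_course_posts, cleanup_small_course_posts),
        (compact_course_geometry, cleanup_small_course_geometry),
    ]:
        daily_path = compact(yesterday)
        push_to_nas(daily_path)
        cleanup(yesterday)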