initial commit
commit c6d1d51b00
11 changed files with 1214 additions and 0 deletions

src/ptscrapper/storage.py (new file, 246 additions)
@@ -0,0 +1,246 @@
# src/ptscraper/storage.py

import glob
import os
from datetime import date, datetime

import geoarrow.pyarrow as ga
import polyline
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from rclone_python import rclone

from .config import HOST_ID  # unique identifier for this host/container
from .config import LOCAL_DATA_DIR  # base local folder, e.g. "/data/pt-scraper"
from .config import RCLONE_REMOTE  # name of your rclone remote, e.g. "nas"
from .config import (  # base path on the remote, e.g. "pt-scraper-data"
    RCLONE_REMOTE_PATH,
)
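
# A minimal sketch of what the imported .config module could provide.
# These values are illustrative assumptions, not part of this commit:
#
#     import socket
#     HOST_ID = socket.gethostname()          # unique identifier for this host/container
#     LOCAL_DATA_DIR = "/data/pt-scraper"     # base local folder
#     RCLONE_REMOTE = "nas"                   # name of the rclone remote
#     RCLONE_REMOTE_PATH = "pt-scraper-data"  # base path on the remote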


def _ensure_dir(path: str):
    """Make sure the directory exists."""
    os.makedirs(path, exist_ok=True)


def write_positions(table: pa.Table, ts: datetime) -> str:
    """
    Convert x/y to a Point geometry, drop the originals,
    and write out a GeoParquet.
    """
    # 1) Extract x/y as NumPy arrays (to_numpy() on a ChunkedArray yields a NumPy copy)
    xs = table.column("x").to_numpy()
    ys = table.column("y").to_numpy()

    # 2) Build a PointType with the correct CRS
    builder = ga.point().with_crs("EPSG:4326")

    # 3) Create the geometry array directly from the Arrow buffers
    geom = ga.as_wkb(
        builder.from_geobuffers(None, xs, ys)
    )  # ensure the geometry is in WKB format

    # 4) drop old coords & append 'geometry'
    table = table.drop(["x", "y"]).append_column("geometry", geom)

    subdir = ts.strftime("positions/%Y/%m/%d/%H")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)

    filename = f"{ts.strftime('%M%S')}_{HOST_ID}.parquet"
    local_path = os.path.join(local_dir, filename)
    pq.write_table(table, local_path, compression="snappy")

    return local_path
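
# Illustrative call (made-up values; any extra columns such as "vehicle_id"
# are carried through unchanged, only "x" and "y" are consumed):
#
#     tbl = pa.table({"vehicle_id": [101], "x": [21.01], "y": [52.23]})
#     write_positions(tbl, datetime(2025, 1, 1, 12, 30, 45))
#     # -> <LOCAL_DATA_DIR>/positions/2025/01/01/12/3045_<HOST_ID>.parquet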


def write_course_posts(table: pa.Table, course_id: int, ts: datetime) -> str:
    """
    Write course-posts data for a single course to Parquet.
    """
    subdir = ts.strftime("courses/%Y-%m-%d")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)

    filename = f"course_{course_id}_{ts.strftime('%H%M%S')}_{HOST_ID}.parquet"
    local_path = os.path.join(local_dir, filename)

    pq.write_table(table, local_path, compression="snappy")
    return local_path


def write_stops(table: pa.Table, snapshot_dt: datetime) -> str:
    """
    Write the daily snapshot of stops (getPosts).
    """
    subdir = snapshot_dt.strftime("stops/%Y-%m-%d")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)

    local_path = os.path.join(local_dir, f"stops_{HOST_ID}.parquet")
    pq.write_table(table, local_path, compression="snappy")
    return local_path


def write_posts_lines(table: pa.Table, snapshot_dt: datetime) -> str:
    """
    Write the daily stop→lines mapping (getPostsLines).
    """
    subdir = snapshot_dt.strftime("stops_lines/%Y-%m-%d")
    local_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    _ensure_dir(local_dir)

    local_path = os.path.join(local_dir, f"posts_lines_{HOST_ID}.parquet")
    pq.write_table(table, local_path, compression="snappy")
    return local_path


def push_to_nas(local_path: str):
    """
    Push a local file to the configured rclone remote,
    preserving its relative path under RCLONE_REMOTE_PATH.
    """
    # Compute the destination directory on the remote. rclone's copy treats
    # the destination as a directory, so pass the parent folder of the file
    # rather than the full file path.
    rel_path = os.path.relpath(local_path, LOCAL_DATA_DIR).replace(os.sep, "/")
    remote_dir = f"{RCLONE_REMOTE}:{RCLONE_REMOTE_PATH}/{os.path.dirname(rel_path)}"

    # Copy, skipping files that already exist remotely
    rclone.copy(local_path, remote_dir, ignore_existing=True)
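
# Sketch of the intended write-then-push flow (orchestration lives outside
# this module; the table and timestamp below are just examples):
#
#     path = write_positions(positions_table, datetime(2025, 1, 1, 12, 30, 45))
#     push_to_nas(path)
#     # the file lands under RCLONE_REMOTE_PATH/positions/2025/01/01/12/ on the remote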


def compact_positions(date_to_compact: date) -> str:
    """
    Read all small Parquet files under positions/YYYY/MM/DD,
    merge them into one table, and write a single daily file.
    Returns the path to that daily Parquet.
    """
    # build glob pattern for that day
    day_glob = os.path.join(
        LOCAL_DATA_DIR, f"positions/{date_to_compact:%Y/%m/%d}/**/*.parquet"
    )
    # resolve the shard list ourselves: ds.dataset() expects concrete paths,
    # not glob patterns, and the directory layout is not hive-partitioned
    files = glob.glob(day_glob, recursive=True)

    # load dataset
    dataset = ds.dataset(files, format="parquet")
    table = dataset.to_table()  # merges all batches

    # write one daily file
    out_dir = os.path.join(LOCAL_DATA_DIR, "positions_compacted")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{date_to_compact.isoformat()}_{HOST_ID}.parquet")
    pq.write_table(table, out_path, compression="snappy")
    return out_path


def cleanup_small_positions(date_to_remove: date):
    """
    Optionally delete the raw minute files once compacted.
    """
    base = os.path.join(LOCAL_DATA_DIR, f"positions/{date_to_remove:%Y/%m/%d}")
    for path in glob.glob(f"{base}/**/*.parquet", recursive=True):
        try:
            os.remove(path)
        except OSError:
            pass
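
# Typical end-of-day compaction sketch (assumes the previous day's minute
# files are complete; the date below is only an example):
#
#     yesterday = date(2025, 1, 1)
#     daily_file = compact_positions(yesterday)
#     push_to_nas(daily_file)
#     cleanup_small_positions(yesterday)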


def compact_course_posts(date_to_compact: date) -> str:
    """
    Read all shards under courses/YYYY-MM-DD/*.parquet,
    merge them into one table, and write a single daily file.
    """
    pattern = os.path.join(
        LOCAL_DATA_DIR, f"courses/{date_to_compact:%Y-%m-%d}/*.parquet"
    )
    # ds.dataset() expects paths, not glob patterns, so expand the pattern first
    dataset = ds.dataset(glob.glob(pattern), format="parquet")
    table = dataset.to_table()

    out_dir = os.path.join(LOCAL_DATA_DIR, "courses_compacted")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{date_to_compact.isoformat()}_{HOST_ID}.parquet")
    pq.write_table(table, out_path, compression="snappy")
    return out_path


def cleanup_small_course_posts(date_to_remove: date):
    """
    Delete the original course shards once compacted.
    """
    base = os.path.join(LOCAL_DATA_DIR, f"courses/{date_to_remove:%Y-%m-%d}")
    for fn in glob.glob(f"{base}/*.parquet"):
        try:
            os.remove(fn)
        except OSError:
            pass


def write_course_geometry(polyline_str: str, course_id: int, ts: datetime) -> str:
    """
    Decode the Google polyline into a GeoArrow LineString WKB,
    attach EPSG:4326, and write out a GeoParquet.
    """
    # 1) Decode → list of (lon, lat)
    coords = polyline.decode(polyline_str, geojson=True)

    # 2) Build a WKT LINESTRING
    pts = ", ".join(f"{x} {y}" for x, y in coords)
    wkt = f"LINESTRING({pts})"

    # 3) Convert that WKT to a WKB ExtensionArray
    #    as_wkb() will parse each WKT string into WKB
    wkb_arr = ga.as_wkb([wkt])

    # 4) Cast into a WkbType with EPSG:4326 built in
    wkb_with_crs = wkb_arr.cast(
        ga.wkb().with_crs("EPSG:4326")
    )

    # 5) Build a 1-row table
    table = pa.Table.from_arrays(
        [
            pa.array([ts]),
            pa.array([course_id]),
            wkb_with_crs,
        ],
        names=["fetch_date", "course_id", "geometry"],
    )

    # 6) Write out the GeoParquet
    subdir = ts.strftime("courses_geometry/%Y-%m-%d")
    out_dir = os.path.join(LOCAL_DATA_DIR, subdir)
    os.makedirs(out_dir, exist_ok=True)

    fn = f"shape_{course_id}_{ts.strftime('%H%M%S')}.parquet"
    out_path = os.path.join(out_dir, fn)
    pq.write_table(table, out_path, compression="snappy")
    return out_path
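
# Illustrative call (the encoded polyline below is a made-up example):
#
#     write_course_geometry("_p~iF~ps|U_ulLnnqC", course_id=123,
#                           ts=datetime(2025, 1, 1, 6, 0, 0))
#     # -> courses_geometry/2025-01-01/shape_123_060000.parquet with columns
#     #    fetch_date, course_id, geometry (WKB LINESTRING, EPSG:4326)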


def compact_course_geometry(date_to_compact: date) -> str:
    """
    Read all geometry shards under courses_geometry/YYYY-MM-DD/*.parquet,
    merge them into one table, and write a single daily file.
    Returns the path to that daily Parquet.
    """
    pattern = os.path.join(
        LOCAL_DATA_DIR, f"courses_geometry/{date_to_compact:%Y-%m-%d}/*.parquet"
    )
    # ds.dataset() expects paths, not glob patterns, so expand the pattern first
    dataset = ds.dataset(glob.glob(pattern), format="parquet")
    table = dataset.to_table()

    out_dir = os.path.join(LOCAL_DATA_DIR, "courses_geometry_compacted")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{date_to_compact.isoformat()}_{HOST_ID}.parquet")
    pq.write_table(table, out_path, compression="snappy")
    return out_path


def cleanup_small_course_geometry(date_to_remove: date):
    """
    Delete the original course-geometry shards once compacted.
    """
    base = os.path.join(LOCAL_DATA_DIR, f"courses_geometry/{date_to_remove:%Y-%m-%d}")
    for fn in glob.glob(f"{base}/*.parquet"):
        try:
            os.remove(fn)
        except OSError:
            pass