-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Description
Related to issue #5
Code for list of stations (and metadata #1):
import requests
import pandas as pd
import numpy as np
# Mapping between INA variable IDs and their English names
# Mapping between INA variable IDs and their standardized English names.
VARIABLE_ENGLISH = {
    4: "discharge-instantaneous",
    40: "discharge-daily-mean",
    68: "discharge-daily-max",
    69: "discharge-daily-min",
    87: "discharge-hourly",
    2: "stage-instantaneous",
    39: "stage-daily-mean",
    85: "stage-hourly",
    73: "water-temperature",
}


def get_ina_metadata(var_ids=None) -> pd.DataFrame:
    """
    Fetch and return full metadata for INA (Argentina) observation series.

    Parameters
    ----------
    var_ids : list[int] | None
        INA variable IDs to query. Defaults to all keys of VARIABLE_ENGLISH.

    Returns
    -------
    pd.DataFrame
        One row per series. Keeps ALL original JSON fields from the API and
        renames/derives only the standardized ones:
            id -> gauge_id
            estacion.nombre -> station_name
            estacion.rio -> river
            estacion.geom.coordinates -> [longitude, latitude]
        The standardized columns are guaranteed to exist (filled with NaN if
        missing): ['gauge_id', 'station_name', 'river', 'latitude',
        'longitude', 'altitude', 'area', 'country', 'source', 'variable'].
        On any failure the error is printed and an empty DataFrame is
        returned (best-effort contract — never raises).

    Source: https://alerta.ina.gob.ar/a5/obs/puntual/series
    """
    if var_ids is None:
        var_ids = list(VARIABLE_ENGLISH.keys())
    base_url = "https://alerta.ina.gob.ar/a5/obs/puntual/series"
    all_records = []
    try:
        for vid in var_ids:
            r = requests.get(base_url, params={"var_id": vid}, timeout=30)
            r.raise_for_status()
            payload = r.json()  # parse once; the original parsed the body twice
            # API can return either {"rows": [...]} or a list directly.
            # (The original called .get() unconditionally, which raised
            # AttributeError on a bare list despite claiming list support.)
            data = payload.get("rows", payload) if isinstance(payload, dict) else payload
            if not isinstance(data, list):
                continue
            for s in data:
                # "estacion" may be absent OR present-but-null; `or {}` guards both.
                est = s.get("estacion") or {}
                geom = est.get("geom") or {}
                coords = geom.get("coordinates") or [None, None]
                # Guard against malformed/short coordinate arrays.
                lon = coords[0] if len(coords) >= 2 else None
                lat = coords[1] if len(coords) >= 2 else None
                # Base record for standardized fields.
                rec = {
                    "gauge_id": est.get("id"),
                    "series_id": s.get("id"),
                    "station_name": est.get("nombre"),
                    "river": est.get("rio"),
                    "latitude": lat,
                    "longitude": lon,
                    "altitude": None,
                    "area": np.nan,
                    "country": est.get("pais", "Argentina"),
                    "source": "Sistema de Información Hidrológica de la Cuenca del Plata - DSIyAH INA",
                    "variable": VARIABLE_ENGLISH.get(vid),
                }
                # Include all raw fields (flatten nested dicts); standardized
                # keys set above take precedence over raw ones.
                flat = pd.json_normalize(s).to_dict(orient="records")[0]
                for k, v in flat.items():
                    if k not in rec:
                        rec[k] = v
                all_records.append(rec)
        df = pd.DataFrame(all_records)
        # Ensure standardized fields exist even if the API omitted them.
        std_cols = [
            "gauge_id",
            "station_name",
            "river",
            "latitude",
            "longitude",
            "altitude",
            "area",
            "country",
            "source",
            "variable",
        ]
        for col in std_cols:
            if col not in df.columns:
                df[col] = np.nan
        # Coerce coordinates to numeric; unparseable values become NaN.
        df["latitude"] = pd.to_numeric(df["latitude"], errors="coerce")
        df["longitude"] = pd.to_numeric(df["longitude"], errors="coerce")
        return df.reset_index(drop=True)
    except Exception as e:
        # Best-effort contract: report the failure and return an empty frame.
        print(f"Failed to fetch INA metadata: {e}")
        return pd.DataFrame()
Code for downloading the data
Here I have a question on how to proceed: for Argentina we need both the gauge_id (the unique station code) and the series_id of the given variable at that gauge — so, in theory, two codes. The current implementation requires the user to supply both. What do you think is the best solution, @kratzert and @simonmoulds? I thought about using the cached data, since the argentina_sites.csv file would contain both the gauge_id and the series_id.
import requests
import pandas as pd
import numpy as np
from typing import Optional
VARIABLE_MAP = {
"discharge-instantaneous": 4,
"discharge-daily-mean": 40,
"discharge-daily-max": 68,
"discharge-daily-min": 69,
"discharge-hourly": 87,
"stage-instantaneous": 2,
"stage-daily-mean": 39,
"stage-hourly": 85,
"water-temperature": 73,
}
def get_ina_data(
gauge_id: str | int,
series_id: str | int,
variable: str,
start_date: Optional[str] = None,
end_date: Optional[str] = None,
show_url: bool = False,
) -> pd.DataFrame:
"""
Download time series from INA using your exact URL pattern
(query string begins with '&' and no 'request=datos').
Source and description: https://alerta.ina.gob.ar/pub/gui/apibase
Returns a DataFrame with columns: ['time', '<variable>'].
"""
variable = variable.lower()
if variable not in VARIABLE_MAP:
raise ValueError(f"Invalid variable '{variable}'. Allowed: {list(VARIABLE_MAP.keys())}")
if not start_date:
start_date = "1900-01-01" # default
if not end_date:
end_date = pd.Timestamp.now().date().strftime("%Y-%m-%d") # today
var_id = VARIABLE_MAP[variable]
base_url = "https://alerta.ina.gob.ar/pub/datos/datos"
url = (
f"{base_url}"
f"&timeStart={start_date}"
f"&timeEnd={end_date}"
f"&seriesId={series_id}"
f"&siteCode={gauge_id}"
f"&varId={var_id}"
f"&format=json"
)
if show_url:
print("Requesting URL:", url)
r = requests.get(url, timeout=30)
r.raise_for_status()
js = r.json()
data = js.get("data", [])
if not data:
return pd.DataFrame(columns=["time", variable])
df = pd.DataFrame(data)
if "timestart" not in df or "valor" not in df:
return pd.DataFrame(columns=["time", variable])
df = df.rename(columns={"timestart": "time", "valor": variable})
df["time"] = pd.to_datetime(df["time"], errors="coerce")
df[variable] = pd.to_numeric(df[variable], errors="coerce")
df = df.dropna(subset=["time"])
df = df.drop_duplicates(subset="time", keep="first")
df = df.sort_values("time").reset_index(drop=True)
df = df[["time", variable]]
return df
Metadata
Metadata
Assignees
Labels
No labels