Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revise metadata filepaths and module names #122

Merged
merged 10 commits into from
Jul 1, 2024
4 changes: 2 additions & 2 deletions python/popgetter/assets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from . import be, ni, uk, us
from . import bel, gb_nir, uk, us

countries = [(mod, mod.__name__.split(".")[-1]) for mod in [be, ni, uk, us]]
countries = [(mod, mod.__name__.split(".")[-1]) for mod in [bel, gb_nir, uk, us]]

__all__ = ["countries"]
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
)

WORKING_DIR = Path("belgium")
asset_prefix = "be"
asset_prefix = "bel"
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,10 @@ def derived_metrics_by_partition(
derived_metrics: list[pd.DataFrame] = []
derived_mmd: list[MetricMetadata] = []

parquet_file_name = "".join(c for c in node if c.isalnum()) + ".parquet"
parquet_file_name = (
f"{asset_prefix}/metrics/"
f"{''.join(c for c in node if c.isalnum()) + '.parquet'}"
)

for metric_spec in metric_specs:
new_table = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
)
from popgetter.utils import markdown_from_plot

from .belgium import asset_prefix
from .belgium import asset_prefix, country
from .census_tables import publisher


Expand Down Expand Up @@ -115,6 +115,7 @@ def geometry(context, sector_geometries) -> list[GeometryOutput]:

for level_details in BELGIUM_GEOMETRY_LEVELS.values():
geometry_metadata = GeometryMetadata(
country_metadata=country,
validity_period_start=date(2023, 1, 1),
validity_period_end=date(2023, 12, 31),
level=level_details.level,
Expand Down
9 changes: 5 additions & 4 deletions python/popgetter/assets/country.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ class Country(ABC):

"""

key_prefix: ClassVar[str]
country_metadata: ClassVar[CountryMetadata]
key_prefix: str
partition_name: str
dataset_node_partition: DynamicPartitionsDefinition

def __init__(self):
self.key_prefix = self.country_metadata.id
self.partition_name = f"{self.key_prefix}_nodes"
self.dataset_node_partition = DynamicPartitionsDefinition(
name=self.partition_name
Expand Down Expand Up @@ -85,9 +87,8 @@ def country_metadata(context):

return country_metadata

@abstractmethod
def _country_metadata(self, context) -> CountryMetadata:
...
def _country_metadata(self, _context) -> CountryMetadata:
return self.country_metadata

def create_data_publisher(self):
"""Creates an asset providing the data publisher metadata."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,19 +241,16 @@ def census_table_metadata(


class NorthernIreland(Country):
key_prefix: ClassVar[str] = "uk-ni"
country_metadata: CountryMetadata = CountryMetadata(
name_short_en="Northern Ireland",
name_official="Northern Ireland",
iso3="GBR",
iso2="GB",
iso3166_2="GB-NIR",
)
geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys())
tables_to_process: list[str] | None = TABLES_TO_PROCESS

def _country_metadata(self, _context) -> CountryMetadata:
return CountryMetadata(
name_short_en="Northern Ireland",
name_official="Northern Ireland",
iso3="GBR",
iso2="GB",
iso3166_2="GB-NIR",
)

def _data_publisher(
self, _context, country_metadata: CountryMetadata
) -> DataPublisher:
Expand Down Expand Up @@ -394,6 +391,7 @@ def _geometry(self, context) -> list[GeometryOutput]:
for level_details in NI_GEO_LEVELS.values():
# TODO: get correct values
geometry_metadata = GeometryMetadata(
country_metadata=self.country_metadata,
validity_period_start=CENSUS_COLLECTION_DATE,
validity_period_end=CENSUS_COLLECTION_DATE,
level=level_details.level,
Expand Down Expand Up @@ -538,7 +536,8 @@ def _derived_metrics(
)

parquet_file_name = (
"".join(c for c in partition_key if c.isalnum()) + ".parquet"
f"{self.key_prefix}/metrics/"
f"{''.join(c for c in partition_key if c.isalnum()) + '.parquet'}"
)
derived_metrics: list[pd.DataFrame] = []
derived_mmd: list[MetricMetadata] = []
Expand Down
30 changes: 8 additions & 22 deletions python/popgetter/io_managers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,27 +132,15 @@ class GeometryOutputPaths:

def get_full_paths_geoms(
self,
context: OutputContext,
geo_metadata: GeometryMetadata,
) -> GeometryOutputPaths:
filename_stem = geo_metadata.filename_stem
asset_prefix = list(context.partition_key.split("/"))[:-1] # e.g. ['be']
filepath_stem = geo_metadata.filename_stem
base_path = self.get_base_path()
return self.GeometryOutputPaths(
flatgeobuf=base_path
/ UPath("/".join([*asset_prefix, "geometries", f"{filename_stem}.fgb"])),
pmtiles=base_path
/ UPath(
"/".join([*asset_prefix, "geometries", f"TODO_{filename_stem}.pmtiles"])
),
geojsonseq=base_path
/ UPath(
"/".join([*asset_prefix, "geometries", f"{filename_stem}.geojsonseq"])
),
names=base_path
/ UPath(
"/".join([*asset_prefix, "geometries", f"{filename_stem}.parquet"])
),
flatgeobuf=base_path / UPath(f"{filepath_stem}.fgb"),
pmtiles=base_path / UPath(f"TODO_{filepath_stem}.pmtiles"),
geojsonseq=base_path / UPath(f"{filepath_stem}.geojsonseq"),
names=base_path / UPath(f"{filepath_stem}.parquet"),
)

def get_full_path_metadata(
Expand Down Expand Up @@ -217,7 +205,7 @@ def handle_output(
output.gdf["GEO_ID"] = output.gdf["GEO_ID"].astype("string")
output.names_df = output.names_df.astype("string")

full_paths = self.get_full_paths_geoms(context, output.metadata)
full_paths = self.get_full_paths_geoms(output.metadata)
penelopeysm marked this conversation as resolved.
Show resolved Hide resolved

self.handle_flatgeobuf(context, output.gdf, full_paths.flatgeobuf)
self.handle_geojsonseq(context, output.gdf, full_paths.geojsonseq)
Expand Down Expand Up @@ -253,12 +241,10 @@ def get_full_path_metadata(

def get_full_path_metrics(
self,
context: OutputContext,
parquet_path: str,
) -> UPath:
base_path = self.get_base_path()
asset_prefix = list(context.partition_key.split("/"))[:-1]
return base_path / UPath("/".join([*asset_prefix, "metrics", parquet_path]))
return base_path / UPath(parquet_path)

def handle_output(
self,
Expand Down Expand Up @@ -329,7 +315,7 @@ def handle_output(
# of the tuple
for metrics_output in obj:
rel_path = metrics_output.metadata[0].metric_parquet_path
full_path = self.get_full_path_metrics(context, rel_path)
full_path = self.get_full_path_metrics(rel_path)
self.handle_df(context, metrics_output.metrics, full_path)

# Add metadata
Expand Down
38 changes: 28 additions & 10 deletions python/popgetter/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,28 @@ def hash_class_vars(self):
Note that `vars()` does not include properties, so the IDs themselves are
not part of the hash, which avoids self-reference issues.
"""

# Must copy the dict to avoid overriding the actual instance attributes!
# Because we're only modifying dates -> strings, we don't need to perform a
# deepcopy
variables = dict(**vars(self))
# Python doesn't serialise dates to JSON, have to convert to ISO 8601 first
for key, val in variables.items():
if isinstance(val, date):
variables[key] = val.isoformat()
return sha256(jcs.canonicalize(variables)).hexdigest()
# deepcopy but all variables must be serializable
def serializable_vars(obj: object) -> dict:
penelopeysm marked this conversation as resolved.
Show resolved Hide resolved
variables = {}
# Check if variables are serializable
for key, val in vars(obj).items():
try:
jcs.canonicalize(val)
variables[key] = val
except Exception:
pass

# Python doesn't serialise dates to JSON, have to convert to ISO 8601 first
for key, val in variables.items():
if isinstance(val, date):
variables[key] = val.isoformat()

return variables

return sha256(jcs.canonicalize(serializable_vars(self))).hexdigest()

@classmethod
def fix_types(cls, df: pd.DataFrame) -> pd.DataFrame:
Expand Down Expand Up @@ -67,8 +80,8 @@ class CountryMetadata(MetadataBaseModel):
@property
def id(self) -> str:
if self.iso3166_2 is not None:
return self.iso3166_2.lower()
return self.iso3.lower()
return self.iso3166_2.lower().replace("-", "_")
return self.iso3.lower().replace("-", "_")

name_short_en: str = Field(
description="The short name of the country in English (for example 'Belgium')."
Expand Down Expand Up @@ -119,10 +132,15 @@ def id(self) -> str:

@computed_field
@property
# TODO: update metadata field name to `filepath_stem` (https://github.com/Urban-Analytics-Technology-Platform/popgetter/issues/129)
def filename_stem(self) -> str:
level = "_".join(self.level.lower().split())
year = self.validity_period_start.year
return f"{level}_{year}"
return f"{self.country_metadata.id}/geometries/{level}_{year}"

country_metadata: CountryMetadata = Field(
"The `CountryMetadata` associated with the geometry.", exclude=True
)

validity_period_start: date = Field(
description="The start of the range of time for which the regions are valid (inclusive)"
Expand Down
20 changes: 10 additions & 10 deletions tests/test_be.py → tests/test_bel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from rdflib import Graph
from rdflib.namespace import DCAT

from popgetter.assets import be
from popgetter.assets import bel


@pytest.fixture(scope="module")
Expand All @@ -36,7 +36,7 @@ def demo_catalog() -> Graph:
@pytest.fixture(scope="module")
def demo_catalog_df(demo_catalog) -> pd.DataFrame:
context = build_asset_context()
return be.census_tables.catalog_as_dataframe(context, demo_catalog)
return bel.census_tables.catalog_as_dataframe(context, demo_catalog)


@pytest.mark.skip(
Expand All @@ -46,7 +46,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors):
# Test the that the row count is correctly added to the metadata
context = build_asset_context()

actual_municipalities = be.census_geometry.aggregate_sectors_to_municipalities(
actual_municipalities = bel.census_geometry.aggregate_sectors_to_municipalities(
context, demo_sectors
)

Expand All @@ -62,7 +62,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors):
@pytest.mark.skip(reason="Fix test_get_population_details_per_municipality first")
def test_get_population_details_per_municipality():
with build_asset_context() as muni_context:
stat_muni = be.census_tables.get_population_details_per_municipality(
stat_muni = bel.census_tables.get_population_details_per_municipality(
muni_context
)

Expand All @@ -87,7 +87,7 @@ def test_pivot_population():
)

# Get the geometries
stat_muni = be.census_tables.get_population_details_per_municipality(
stat_muni = bel.census_tables.get_population_details_per_municipality(
muni_context
)

Expand All @@ -99,7 +99,7 @@ def test_pivot_population():

with build_asset_context() as pivot_context:
# Pivot the population
pivoted = be.pivot_population(pivot_context, stat_muni)
pivoted = bel.pivot_population(pivot_context, stat_muni)

expected_number_of_municipalities = 581

Expand All @@ -115,7 +115,7 @@ def test_demo_catalog(demo_catalog):
actual_length = len(
list(
demo_catalog.objects(
subject=be.census_tables.opendata_catalog_root,
subject=bel.census_tables.opendata_catalog_root,
predicate=DCAT.dataset,
unique=False,
)
Expand All @@ -128,7 +128,7 @@ def test_demo_catalog(demo_catalog):
def test_catalog_metadata_details(demo_catalog_df):
# Get the metadata for a specific dataset in the demo catalogue:
# https://statbel.fgov.be/node/4151 "Population by Statistical sector"
# mmd = be.census_tables.get_mmd_from_dataset_node(
# mmd = bel.census_tables.get_mmd_from_dataset_node(
# demo_catalog, dataset_node=URIRef("https://statbel.fgov.be/node/4151")
# )

Expand Down Expand Up @@ -179,7 +179,7 @@ def test_catalog_as_dataframe(demo_catalog_df):

# # Convert the demo catalog to a DataFrame
# with build_asset_context() as context:
# catalog_df = be.census_tables.catalog_as_dataframe(context, demo_catalog_df)
# catalog_df = bel.census_tables.catalog_as_dataframe(context, demo_catalog_df)

# # Check that the catalog has been converted to a DataFrame
# assert isinstance(catalog_df, pd.DataFrame)
Expand Down Expand Up @@ -228,7 +228,7 @@ def test_filter_known_failing_datasets():
"2676",
]

actual_list = be.census_tables.filter_known_failing_datasets(mock_catalog)
actual_list = bel.census_tables.filter_known_failing_datasets(mock_catalog)

assert mock_catalog != expected_list
assert actual_list != mock_catalog
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cloud_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# )
# generate_pmtiles,
# TODO, Move this to a fixture to somewhere more universal
from .test_be import demo_sectors # noqa: F401
from .test_bel import demo_sectors # noqa: F401

# Commented out test as part of #92 as functions no longer importable
# @pytest.mark.skip(
Expand Down
2 changes: 1 addition & 1 deletion tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def test_source_data_release_hash():
)
assert (
source_data_release.id
== "9ec7e234d73664339e4c1f04bfa485dbb17e204dd72dc3ffbb9cab6870475597"
== "4d61bfe401ba17becd02d6b3912152c135daa9ecaebc9bd45a589dc831a85217"
)

source_data_release2 = SourceDataRelease(
Expand Down
Loading