diff --git a/python/popgetter/assets/__init__.py b/python/popgetter/assets/__init__.py index 55e91dc..1cb7eb6 100644 --- a/python/popgetter/assets/__init__.py +++ b/python/popgetter/assets/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations -from . import be, ni, uk, us +from . import bel, gb_nir, uk, us -countries = [(mod, mod.__name__.split(".")[-1]) for mod in [be, ni, uk, us]] +countries = [(mod, mod.__name__.split(".")[-1]) for mod in [bel, gb_nir, uk, us]] __all__ = ["countries"] diff --git a/python/popgetter/assets/be/__init__.py b/python/popgetter/assets/bel/__init__.py similarity index 100% rename from python/popgetter/assets/be/__init__.py rename to python/popgetter/assets/bel/__init__.py diff --git a/python/popgetter/assets/be/belgium.py b/python/popgetter/assets/bel/belgium.py similarity index 93% rename from python/popgetter/assets/be/belgium.py rename to python/popgetter/assets/bel/belgium.py index 45403af..d3d0f89 100644 --- a/python/popgetter/assets/be/belgium.py +++ b/python/popgetter/assets/bel/belgium.py @@ -13,4 +13,4 @@ ) WORKING_DIR = Path("belgium") -asset_prefix = "be" +asset_prefix = "bel" diff --git a/python/popgetter/assets/be/census_derived.py b/python/popgetter/assets/bel/census_derived.py similarity index 98% rename from python/popgetter/assets/be/census_derived.py rename to python/popgetter/assets/bel/census_derived.py index 3aec6c9..2f4986d 100644 --- a/python/popgetter/assets/be/census_derived.py +++ b/python/popgetter/assets/bel/census_derived.py @@ -275,7 +275,10 @@ def derived_metrics_by_partition( derived_metrics: list[pd.DataFrame] = [] derived_mmd: list[MetricMetadata] = [] - parquet_file_name = "".join(c for c in node if c.isalnum()) + ".parquet" + parquet_file_name = ( + f"{asset_prefix}/metrics/" + f"{''.join(c for c in node if c.isalnum()) + '.parquet'}" + ) for metric_spec in metric_specs: new_table = ( diff --git a/python/popgetter/assets/be/census_geometry.py b/python/popgetter/assets/bel/census_geometry.py similarity index 98% rename from python/popgetter/assets/be/census_geometry.py rename to python/popgetter/assets/bel/census_geometry.py index c07cf94..a0c22e2 100644 --- a/python/popgetter/assets/be/census_geometry.py +++ b/python/popgetter/assets/bel/census_geometry.py @@ -23,7 +23,7 @@ ) from popgetter.utils import markdown_from_plot -from .belgium import asset_prefix +from .belgium import asset_prefix, country from .census_tables import publisher @@ -115,6 +115,7 @@ def geometry(context, sector_geometries) -> list[GeometryOutput]: for level_details in BELGIUM_GEOMETRY_LEVELS.values(): geometry_metadata = GeometryMetadata( + country_metadata=country, validity_period_start=date(2023, 1, 1), validity_period_end=date(2023, 12, 31), level=level_details.level, diff --git a/python/popgetter/assets/be/census_tables.py b/python/popgetter/assets/bel/census_tables.py similarity index 100% rename from python/popgetter/assets/be/census_tables.py rename to python/popgetter/assets/bel/census_tables.py diff --git a/python/popgetter/assets/country.py b/python/popgetter/assets/country.py index 448d3a9..4349c97 100644 --- a/python/popgetter/assets/country.py +++ b/python/popgetter/assets/country.py @@ -38,11 +38,13 @@ class Country(ABC): """ - key_prefix: ClassVar[str] + country_metadata: ClassVar[CountryMetadata] + key_prefix: str partition_name: str dataset_node_partition: DynamicPartitionsDefinition def __init__(self): + self.key_prefix = self.country_metadata.id self.partition_name = f"{self.key_prefix}_nodes" self.dataset_node_partition = DynamicPartitionsDefinition( name=self.partition_name @@ -85,9 +87,8 @@ def country_metadata(context): return country_metadata - @abstractmethod - def _country_metadata(self, context) -> CountryMetadata: - ... + def _country_metadata(self, _context) -> CountryMetadata: + return self.country_metadata def create_data_publisher(self): """Creates an asset providing the data publisher metadata.""" diff --git a/python/popgetter/assets/ni/README.md b/python/popgetter/assets/gb_nir/README.md similarity index 100% rename from python/popgetter/assets/ni/README.md rename to python/popgetter/assets/gb_nir/README.md diff --git a/python/popgetter/assets/ni/__init__.py b/python/popgetter/assets/gb_nir/__init__.py similarity index 98% rename from python/popgetter/assets/ni/__init__.py rename to python/popgetter/assets/gb_nir/__init__.py index 06c3715..740538b 100644 --- a/python/popgetter/assets/ni/__init__.py +++ b/python/popgetter/assets/gb_nir/__init__.py @@ -241,19 +241,16 @@ def census_table_metadata( class NorthernIreland(Country): - key_prefix: ClassVar[str] = "uk-ni" + country_metadata: CountryMetadata = CountryMetadata( + name_short_en="Northern Ireland", + name_official="Northern Ireland", + iso3="GBR", + iso2="GB", + iso3166_2="GB-NIR", + ) geo_levels: ClassVar[list[str]] = list(NI_GEO_LEVELS.keys()) tables_to_process: list[str] | None = TABLES_TO_PROCESS - def _country_metadata(self, _context) -> CountryMetadata: - return CountryMetadata( - name_short_en="Northern Ireland", - name_official="Northern Ireland", - iso3="GBR", - iso2="GB", - iso3166_2="GB-NIR", - ) - def _data_publisher( self, _context, country_metadata: CountryMetadata ) -> DataPublisher: @@ -394,6 +391,7 @@ def _geometry(self, context) -> list[GeometryOutput]: for level_details in NI_GEO_LEVELS.values(): # TODO: get correct values geometry_metadata = GeometryMetadata( + country_metadata=self.country_metadata, validity_period_start=CENSUS_COLLECTION_DATE, validity_period_end=CENSUS_COLLECTION_DATE, level=level_details.level, @@ -538,7 +536,8 @@ def _derived_metrics( ) parquet_file_name = ( - "".join(c for c in partition_key if c.isalnum()) + ".parquet" + f"{self.key_prefix}/metrics/" + f"{''.join(c for c in partition_key if c.isalnum()) + '.parquet'}" ) derived_metrics: list[pd.DataFrame] = [] derived_mmd: list[MetricMetadata] = [] diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 3a524e7..611f313 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -132,27 +132,15 @@ class GeometryOutputPaths: def get_full_paths_geoms( self, - context: OutputContext, geo_metadata: GeometryMetadata, ) -> GeometryOutputPaths: - filename_stem = geo_metadata.filename_stem - asset_prefix = list(context.partition_key.split("/"))[:-1] # e.g. ['be'] + filepath_stem = geo_metadata.filename_stem base_path = self.get_base_path() return self.GeometryOutputPaths( - flatgeobuf=base_path - / UPath("/".join([*asset_prefix, "geometries", f"{filename_stem}.fgb"])), - pmtiles=base_path - / UPath( - "/".join([*asset_prefix, "geometries", f"TODO_{filename_stem}.pmtiles"]) - ), - geojsonseq=base_path - / UPath( - "/".join([*asset_prefix, "geometries", f"{filename_stem}.geojsonseq"]) - ), - names=base_path - / UPath( - "/".join([*asset_prefix, "geometries", f"{filename_stem}.parquet"]) - ), + flatgeobuf=base_path / UPath(f"{filepath_stem}.fgb"), + pmtiles=base_path / UPath(f"TODO_{filepath_stem}.pmtiles"), + geojsonseq=base_path / UPath(f"{filepath_stem}.geojsonseq"), + names=base_path / UPath(f"{filepath_stem}.parquet"), ) def get_full_path_metadata( @@ -217,7 +205,7 @@ def handle_output( output.gdf["GEO_ID"] = output.gdf["GEO_ID"].astype("string") output.names_df = output.names_df.astype("string") - full_paths = self.get_full_paths_geoms(context, output.metadata) + full_paths = self.get_full_paths_geoms(output.metadata) self.handle_flatgeobuf(context, output.gdf, full_paths.flatgeobuf) self.handle_geojsonseq(context, output.gdf, full_paths.geojsonseq) @@ -253,12 +241,10 @@ def get_full_path_metadata( def get_full_path_metrics( self, - context: OutputContext, parquet_path: str, ) -> UPath: base_path = self.get_base_path() - asset_prefix = list(context.partition_key.split("/"))[:-1] - return base_path / UPath("/".join([*asset_prefix, "metrics", parquet_path])) + return base_path / UPath(parquet_path) def handle_output( self, @@ -329,7 +315,7 @@ def handle_output( # of the tuple for metrics_output in obj: rel_path = metrics_output.metadata[0].metric_parquet_path - full_path = self.get_full_path_metrics(context, rel_path) + full_path = self.get_full_path_metrics(rel_path) self.handle_df(context, metrics_output.metrics, full_path) # Add metadata diff --git a/python/popgetter/metadata.py b/python/popgetter/metadata.py index a49032a..16107d9 100644 --- a/python/popgetter/metadata.py +++ b/python/popgetter/metadata.py @@ -19,15 +19,28 @@ def hash_class_vars(self): Note that `vars()` does not include properties, so the IDs themselves are not part of the hash, which avoids self-reference issues. """ + # Must copy the dict to avoid overriding the actual instance attributes! # Because we're only modifying dates -> strings, we don't need to perform a - # deepcopy - variables = dict(**vars(self)) - # Python doesn't serialise dates to JSON, have to convert to ISO 8601 first - for key, val in variables.items(): - if isinstance(val, date): - variables[key] = val.isoformat() - return sha256(jcs.canonicalize(variables)).hexdigest() + # deepcopy but all variables must be serializable + def serializable_vars(obj: object) -> dict: + variables = {} + # Check if variables are serializable + for key, val in vars(obj).items(): + try: + jcs.canonicalize(val) + variables[key] = val + except Exception: + pass + + # Python doesn't serialise dates to JSON, have to convert to ISO 8601 first + for key, val in variables.items(): + if isinstance(val, date): + variables[key] = val.isoformat() + + return variables + + return sha256(jcs.canonicalize(serializable_vars(self))).hexdigest() @classmethod def fix_types(cls, df: pd.DataFrame) -> pd.DataFrame: @@ -67,8 +80,8 @@ class CountryMetadata(MetadataBaseModel): @property def id(self) -> str: if self.iso3166_2 is not None: - return self.iso3166_2.lower() - return self.iso3.lower() + return self.iso3166_2.lower().replace("-", "_") + return self.iso3.lower().replace("-", "_") name_short_en: str = Field( description="The short name of the country in English (for example 'Belgium')." @@ -119,10 +132,15 @@ def id(self) -> str: @computed_field @property + # TODO: update metadata field name to `filepath_stem` (https://github.com/Urban-Analytics-Technology-Platform/popgetter/issues/129) def filename_stem(self) -> str: level = "_".join(self.level.lower().split()) year = self.validity_period_start.year - return f"{level}_{year}" + return f"{self.country_metadata.id}/geometries/{level}_{year}" + + country_metadata: CountryMetadata = Field( + "The `CountryMetadata` associated with the geometry.", exclude=True + ) validity_period_start: date = Field( description="The start of the range of time for which the regions are valid (inclusive)" diff --git a/tests/test_be.py b/tests/test_bel.py similarity index 91% rename from tests/test_be.py rename to tests/test_bel.py index fae975b..b08a6da 100644 --- a/tests/test_be.py +++ b/tests/test_bel.py @@ -12,7 +12,7 @@ from rdflib import Graph from rdflib.namespace import DCAT -from popgetter.assets import be +from popgetter.assets import bel @pytest.fixture(scope="module") @@ -36,7 +36,7 @@ def demo_catalog() -> Graph: @pytest.fixture(scope="module") def demo_catalog_df(demo_catalog) -> pd.DataFrame: context = build_asset_context() - return be.census_tables.catalog_as_dataframe(context, demo_catalog) + return bel.census_tables.catalog_as_dataframe(context, demo_catalog) @pytest.mark.skip( @@ -46,7 +46,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors): # Test the that the row count is correctly added to the metadata context = build_asset_context() - actual_municipalities = be.census_geometry.aggregate_sectors_to_municipalities( + actual_municipalities = bel.census_geometry.aggregate_sectors_to_municipalities( context, demo_sectors ) @@ -62,7 +62,7 @@ def test_aggregate_sectors_to_municipalities(demo_sectors): @pytest.mark.skip(reason="Fix test_get_population_details_per_municipality first") def test_get_population_details_per_municipality(): with build_asset_context() as muni_context: - stat_muni = be.census_tables.get_population_details_per_municipality( + stat_muni = bel.census_tables.get_population_details_per_municipality( muni_context ) @@ -87,7 +87,7 @@ def test_pivot_population(): ) # Get the geometries - stat_muni = be.census_tables.get_population_details_per_municipality( + stat_muni = bel.census_tables.get_population_details_per_municipality( muni_context ) @@ -99,7 +99,7 @@ def test_pivot_population(): with build_asset_context() as pivot_context: # Pivot the population - pivoted = be.pivot_population(pivot_context, stat_muni) + pivoted = bel.pivot_population(pivot_context, stat_muni) expected_number_of_municipalities = 581 @@ -115,7 +115,7 @@ def test_demo_catalog(demo_catalog): actual_length = len( list( demo_catalog.objects( - subject=be.census_tables.opendata_catalog_root, + subject=bel.census_tables.opendata_catalog_root, predicate=DCAT.dataset, unique=False, ) @@ -128,7 +128,7 @@ def test_demo_catalog(demo_catalog): def test_catalog_metadata_details(demo_catalog_df): # Get the metadata for a specific dataset in the demo catalogue: # https://statbel.fgov.be/node/4151 "Population by Statistical sector" - # mmd = be.census_tables.get_mmd_from_dataset_node( + # mmd = bel.census_tables.get_mmd_from_dataset_node( # demo_catalog, dataset_node=URIRef("https://statbel.fgov.be/node/4151") # ) @@ -179,7 +179,7 @@ def test_catalog_as_dataframe(demo_catalog_df): # # Convert the demo catalog to a DataFrame # with build_asset_context() as context: - # catalog_df = be.census_tables.catalog_as_dataframe(context, demo_catalog_df) + # catalog_df = bel.census_tables.catalog_as_dataframe(context, demo_catalog_df) # # Check that the catalog has been converted to a DataFrame # assert isinstance(catalog_df, pd.DataFrame) @@ -228,7 +228,7 @@ def test_filter_known_failing_datasets(): "2676", ] - actual_list = be.census_tables.filter_known_failing_datasets(mock_catalog) + actual_list = bel.census_tables.filter_known_failing_datasets(mock_catalog) assert mock_catalog != expected_list assert actual_list != mock_catalog diff --git a/tests/test_cloud_outputs.py b/tests/test_cloud_outputs.py index 7e65cfd..73df5d2 100644 --- a/tests/test_cloud_outputs.py +++ b/tests/test_cloud_outputs.py @@ -9,7 +9,7 @@ # ) # generate_pmtiles, # TODO, Move this to a fixture to somewhere more universal -from .test_be import demo_sectors # noqa: F401 +from .test_bel import demo_sectors # noqa: F401 # Commented out test as part of #92 as functions no longer importable # @pytest.mark.skip( diff --git a/tests/test_metadata.py b/tests/test_metadata.py index bc272b2..502f186 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -57,7 +57,7 @@ def test_source_data_release_hash(): ) assert ( source_data_release.id - == "9ec7e234d73664339e4c1f04bfa485dbb17e204dd72dc3ffbb9cab6870475597" + == "4d61bfe401ba17becd02d6b3912152c135daa9ecaebc9bd45a589dc831a85217" ) source_data_release2 = SourceDataRelease(