diff --git a/internal/geoparquet/geoparquet.go b/internal/geoparquet/geoparquet.go index 67eeaa2..9a784b7 100644 --- a/internal/geoparquet/geoparquet.go +++ b/internal/geoparquet/geoparquet.go @@ -32,6 +32,7 @@ func getMetadata(fileReader *file.Reader, convertOptions *ConvertOptions) *Metad primaryColumn = convertOptions.InputPrimaryColumn } metadata = &Metadata{ + Version: Version, PrimaryColumn: primaryColumn, Columns: map[string]*GeometryColumn{ primaryColumn: getDefaultGeometryColumn(), diff --git a/internal/pqutil/transform.go b/internal/pqutil/transform.go index 2061a7d..c4599b0 100644 --- a/internal/pqutil/transform.go +++ b/internal/pqutil/transform.go @@ -175,9 +175,6 @@ func TransformByColumn(config *TransformConfig) error { if err != nil { return err } - if transformed.DataType() != outputField.Type { - return fmt.Errorf("transform generated an unexpected type, got %s, expected %s", transformed.DataType().Name(), outputField.Type.Name()) - } arr = transformed } colWriter, colWriterErr := pqarrow.NewArrowColumnWriter(arr, 0, int64(arr.Len()), outputManifest, rowGroupWriter, fieldNum) diff --git a/internal/validator/rules.go b/internal/validator/rules.go index 779f6eb..60e7883 100644 --- a/internal/validator/rules.go +++ b/internal/validator/rules.go @@ -432,13 +432,13 @@ func GeometryUngrouped() Rule { title: "geometry columns must not be grouped", validate: func(info *FileInfo) error { metadata := info.Metadata - sc := info.File.MetaData().Schema + root := info.File.MetaData().Schema.Root() for name := range metadata.Columns { - index := sc.ColumnIndexByName(name) + index := root.FieldIndexByName(name) if index < 0 { return fatal("missing geometry column %q", name) } - _, ok := sc.Root().Field(index).(*schema.PrimitiveNode) + _, ok := root.Field(index).(*schema.PrimitiveNode) if !ok { return fmt.Errorf("column %q must not be a group", name) } @@ -454,14 +454,14 @@ func GeometryDataType() Rule { title: "geometry columns must be stored using the BYTE_ARRAY 
parquet type", validate: func(info *FileInfo) error { metadata := info.Metadata - sc := info.File.MetaData().Schema + root := info.File.MetaData().Schema.Root() for name := range metadata.Columns { - index := sc.ColumnIndexByName(name) + index := root.FieldIndexByName(name) if index < 0 { return fatal("missing geometry column %q", name) } - field, ok := sc.Root().Field(index).(*schema.PrimitiveNode) + field, ok := root.Field(index).(*schema.PrimitiveNode) if !ok { return fatal("expected primitive column for %q", name) } @@ -480,14 +480,14 @@ func GeometryRepetition() Rule { title: "geometry columns must be required or optional, not repeated", validate: func(info *FileInfo) error { metadata := info.Metadata - sc := info.File.MetaData().Schema + root := info.File.MetaData().Schema.Root() for name := range metadata.Columns { - index := sc.ColumnIndexByName(name) + index := root.FieldIndexByName(name) if index < 0 { return fatal("missing geometry column %q", name) } - repetitionType := sc.Root().Field(index).RepetitionType() + repetitionType := root.Field(index).RepetitionType() if repetitionType == parquet.Repetitions.Repeated { return fmt.Errorf("column %q must not be repeated", name) } diff --git a/internal/validator/testdata/complex-types/expected.json b/internal/validator/testdata/complex-types/expected.json new file mode 100644 index 0000000..313c688 --- /dev/null +++ b/internal/validator/testdata/complex-types/expected.json @@ -0,0 +1,105 @@ +{ + "checks": [ + { + "title": "file must include a \"geo\" metadata key", + "run": true, + "passed": true + }, + { + "title": "metadata must be a JSON object", + "run": true, + "passed": true + }, + { + "title": "metadata must include a \"version\" string", + "run": true, + "passed": true + }, + { + "title": "metadata must include a \"primary_column\" string", + "run": true, + "passed": true + }, + { + "title": "metadata must include a \"columns\" object", + "run": true, + "passed": true + }, + { + "title": "column 
metadata must include the \"primary_column\" name", + "run": true, + "passed": true + }, + { + "title": "column metadata must include a valid \"encoding\" string", + "run": true, + "passed": true + }, + { + "title": "column metadata must include a \"geometry_types\" list", + "run": true, + "passed": true + }, + { + "title": "optional \"crs\" must be null or a PROJJSON object", + "run": true, + "passed": true + }, + { + "title": "optional \"orientation\" must be a valid string", + "run": true, + "passed": true + }, + { + "title": "optional \"edges\" must be a valid string", + "run": true, + "passed": true + }, + { + "title": "optional \"bbox\" must be an array of 4 or 6 numbers", + "run": true, + "passed": true + }, + { + "title": "optional \"epoch\" must be a number", + "run": true, + "passed": true + }, + { + "title": "geometry columns must not be grouped", + "run": true, + "passed": true + }, + { + "title": "geometry columns must be stored using the BYTE_ARRAY parquet type", + "run": true, + "passed": true + }, + { + "title": "geometry columns must be required or optional, not repeated", + "run": true, + "passed": true + }, + { + "title": "all geometry values match the \"encoding\" metadata", + "run": true, + "passed": true + }, + { + "title": "all geometry types must be included in the \"geometry_types\" metadata (if not empty)", + "run": true, + "passed": true + }, + { + "title": "all polygon geometries must follow the \"orientation\" metadata (if present)", + "run": true, + "passed": true + }, + { + "title": "all geometries must fall within the \"bbox\" metadata (if present)", + "run": true, + "passed": true + } + ], + "metadataOnly": false +} \ No newline at end of file diff --git a/internal/validator/testdata/complex-types/input.json b/internal/validator/testdata/complex-types/input.json new file mode 100644 index 0000000..17aefcc --- /dev/null +++ b/internal/validator/testdata/complex-types/input.json @@ -0,0 +1,85 @@ +{ + "metadata": { + "version": 
"1.0.0", + "primary_column": "geometry", + "columns": { + "geometry": { + "encoding": "WKB", + "geometry_types": [ + "Point" + ], + "orientation": "counterclockwise", + "edges": "planar", + "bbox": [ + 0, + 0, + 0, + 0 + ], + "epoch": 2021.47, + "crs": { + "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", + "type": "GeographicCRS", + "name": "WGS 84 longitude-latitude", + "datum": { + "type": "GeodeticReferenceFrame", + "name": "World Geodetic System 1984", + "ellipsoid": { + "name": "WGS 84", + "semi_major_axis": 6378137, + "inverse_flattening": 298.257223563 + } + }, + "coordinate_system": { + "subtype": "ellipsoidal", + "axis": [ + { + "name": "Geodetic longitude", + "abbreviation": "Lon", + "direction": "east", + "unit": "degree" + }, + { + "name": "Geodetic latitude", + "abbreviation": "Lat", + "direction": "north", + "unit": "degree" + } + ] + }, + "id": { + "authority": "OGC", + "code": "CRS84" + } + } + } + } + }, + "data": { + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "properties": { + "numbers": [2, 4, 6, 8], + "strings": ["chicken", "soup"], + "object": { + "name": "Bob" + }, + "names": { + "common": [ + {"value": "Hello", "language": "en"} + ] + } + }, + "geometry": { + "type": "Point", + "coordinates": [ + 0, + 0 + ] + } + } + ] + } +} \ No newline at end of file diff --git a/internal/validator/validator_test.go b/internal/validator/validator_test.go index bc12586..de9a620 100644 --- a/internal/validator/validator_test.go +++ b/internal/validator/validator_test.go @@ -28,11 +28,15 @@ import ( "github.com/apache/arrow/go/v14/parquet" "github.com/apache/arrow/go/v14/parquet/file" + "github.com/paulmach/orb" + "github.com/paulmach/orb/encoding/wkb" "github.com/planetlabs/gpq/internal/geojson" "github.com/planetlabs/gpq/internal/geoparquet" "github.com/planetlabs/gpq/internal/pqutil" + "github.com/planetlabs/gpq/internal/test" "github.com/planetlabs/gpq/internal/validator" 
"github.com/santhosh-tekuri/jsonschema/v5" + "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" ) @@ -133,6 +137,7 @@ func (s *Suite) TearDownSuite() { func (s *Suite) TestValidCases() { cases := []string{ "example-v1.0.0-beta.1.parquet", + "example-v1.0.0.parquet", } validatorAll := validator.New(false) @@ -157,11 +162,168 @@ func (s *Suite) TestValidCases() { } } +func (s *Suite) TestConvertedWKT() { + type Row struct { + Name string `parquet:"name=name, logical=String" json:"name"` + Geometry string `parquet:"name=geometry, logical=String" json:"geometry"` + } + + rows := []*Row{ + { + Name: "test-point-1", + Geometry: "POINT (1 2)", + }, + { + Name: "test-point-2", + Geometry: "POINT (3 4)", + }, + } + + input := test.ParquetFromStructs(s.T(), rows) + + geoparquetBytes := &bytes.Buffer{} + s.Require().NoError(geoparquet.FromParquet(input, geoparquetBytes, nil)) + + filePath := "test-wkt.parquet" + ctx := context.Background() + validatorAll := validator.New(false) + validatorMeta := validator.New(true) + + allReport, allErr := validatorAll.Validate(ctx, bytes.NewReader(geoparquetBytes.Bytes()), filePath) + s.Require().NoError(allErr) + s.assertExpectedReport("all-pass", allReport) + + metaReport, metaErr := validatorMeta.Validate(ctx, bytes.NewReader(geoparquetBytes.Bytes()), filePath) + s.Require().NoError(metaErr) + s.assertExpectedReport("all-pass-meta", metaReport) +} + +func (s *Suite) TestConvertedAltPrimaryColumnWKT() { + type Row struct { + Name string `parquet:"name=name, logical=String" json:"name"` + AltGeometry string `parquet:"name=alt_geometry, logical=String" json:"alt_geometry"` + } + + rows := []*Row{ + { + Name: "test-point-1", + AltGeometry: "POINT (1 2)", + }, + { + Name: "test-point-2", + AltGeometry: "POINT (3 4)", + }, + } + + input := test.ParquetFromStructs(s.T(), rows) + + geoparquetBytes := &bytes.Buffer{} + convertOptions := &geoparquet.ConvertOptions{ + InputPrimaryColumn: "alt_geometry", + } + 
s.Require().NoError(geoparquet.FromParquet(input, geoparquetBytes, convertOptions)) + + filePath := "test-wkt.parquet" + ctx := context.Background() + validatorAll := validator.New(false) + validatorMeta := validator.New(true) + + allReport, allErr := validatorAll.Validate(ctx, bytes.NewReader(geoparquetBytes.Bytes()), filePath) + s.Require().NoError(allErr) + s.assertExpectedReport("all-pass", allReport) + + metaReport, metaErr := validatorMeta.Validate(ctx, bytes.NewReader(geoparquetBytes.Bytes()), filePath) + s.Require().NoError(metaErr) + s.assertExpectedReport("all-pass-meta", metaReport) +} + +func toWKB(t *testing.T, geometry orb.Geometry) []byte { + data, err := wkb.Marshal(geometry) + require.NoError(t, err) + return data +} + +func (s *Suite) TestConvertedWKB() { + type Row struct { + Name string `parquet:"name=name, logical=String" json:"name"` + Geometry []byte `parquet:"name=geometry" json:"geometry"` + } + + rows := []*Row{ + { + Name: "test-point-1", + Geometry: toWKB(s.T(), orb.Point{1, 2}), + }, + { + Name: "test-point-2", + Geometry: toWKB(s.T(), orb.Point{3, 4}), + }, + } + + input := test.ParquetFromStructs(s.T(), rows) + + geoparquetBytes := &bytes.Buffer{} + s.Require().NoError(geoparquet.FromParquet(input, geoparquetBytes, nil)) + + filePath := "test-wkb.parquet" + ctx := context.Background() + validatorAll := validator.New(false) + validatorMeta := validator.New(true) + + allReport, allErr := validatorAll.Validate(ctx, bytes.NewReader(geoparquetBytes.Bytes()), filePath) + s.Require().NoError(allErr) + s.assertExpectedReport("all-pass", allReport) + + metaReport, metaErr := validatorMeta.Validate(ctx, bytes.NewReader(geoparquetBytes.Bytes()), filePath) + s.Require().NoError(metaErr) + s.assertExpectedReport("all-pass-meta", metaReport) +} + +func (s *Suite) TestConvertedAltPrimaryColumnWKB() { + type Row struct { + Name string `parquet:"name=name, logical=String" json:"name"` + AltGeometry []byte `parquet:"name=alt_geometry" 
json:"alt_geometry"` + } + + rows := []*Row{ + { + Name: "test-point-1", + AltGeometry: toWKB(s.T(), orb.Point{1, 2}), + }, + { + Name: "test-point-2", + AltGeometry: toWKB(s.T(), orb.Point{3, 4}), + }, + } + + input := test.ParquetFromStructs(s.T(), rows) + + geoparquetBytes := &bytes.Buffer{} + convertOptions := &geoparquet.ConvertOptions{ + InputPrimaryColumn: "alt_geometry", + } + s.Require().NoError(geoparquet.FromParquet(input, geoparquetBytes, convertOptions)) + + filePath := "test-wkb.parquet" + ctx := context.Background() + validatorAll := validator.New(false) + validatorMeta := validator.New(true) + + allReport, allErr := validatorAll.Validate(ctx, bytes.NewReader(geoparquetBytes.Bytes()), filePath) + s.Require().NoError(allErr) + s.assertExpectedReport("all-pass", allReport) + + metaReport, metaErr := validatorMeta.Validate(ctx, bytes.NewReader(geoparquetBytes.Bytes()), filePath) + s.Require().NoError(metaErr) + s.assertExpectedReport("all-pass-meta", metaReport) +} + func (s *Suite) TestReport() { cases := []string{ "all-pass", "all-pass-meta", "all-pass-minimal", + "complex-types", "bad-metadata-type", "missing-version", "missing-primary-column",