Skip to content

Commit

Permalink
Support Snowflake (spotify#5500)
Browse files Browse the repository at this point in the history
  • Loading branch information
turb committed Sep 25, 2024
1 parent b2c4ff1 commit b53021e
Show file tree
Hide file tree
Showing 9 changed files with 479 additions and 2 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,11 +120,11 @@ jobs:

- name: Make target directories
if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
run: mkdir -p scio-bom/target scio-tensorflow/target site/target scio-cassandra/cassandra3/target scio-elasticsearch/es8/target scio-jdbc/target scio-macros/target scio-grpc/target scio-elasticsearch/common/target scio-test/target scio-avro/target scio-elasticsearch/es7/target scio-redis/target scio-extra/target scio-test/parquet/target scio-test/core/target scio-google-cloud-platform/target scio-smb/target scio-test/google-cloud-platform/target scio-neo4j/target scio-parquet/target scio-core/target scio-repl/target project/target
run: mkdir -p scio-bom/target scio-tensorflow/target site/target scio-cassandra/cassandra3/target scio-elasticsearch/es8/target scio-jdbc/target scio-macros/target scio-grpc/target scio-elasticsearch/common/target scio-test/target scio-avro/target scio-elasticsearch/es7/target scio-snowflake/target scio-redis/target scio-extra/target scio-test/parquet/target scio-test/core/target scio-google-cloud-platform/target scio-smb/target scio-test/google-cloud-platform/target scio-neo4j/target scio-parquet/target scio-core/target scio-repl/target project/target

- name: Compress target directories
if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
run: tar cf targets.tar scio-bom/target scio-tensorflow/target site/target scio-cassandra/cassandra3/target scio-elasticsearch/es8/target scio-jdbc/target scio-macros/target scio-grpc/target scio-elasticsearch/common/target scio-test/target scio-avro/target scio-elasticsearch/es7/target scio-redis/target scio-extra/target scio-test/parquet/target scio-test/core/target scio-google-cloud-platform/target scio-smb/target scio-test/google-cloud-platform/target scio-neo4j/target scio-parquet/target scio-core/target scio-repl/target project/target
run: tar cf targets.tar scio-bom/target scio-tensorflow/target site/target scio-cassandra/cassandra3/target scio-elasticsearch/es8/target scio-jdbc/target scio-macros/target scio-grpc/target scio-elasticsearch/common/target scio-test/target scio-avro/target scio-elasticsearch/es7/target scio-snowflake/target scio-redis/target scio-extra/target scio-test/parquet/target scio-test/core/target scio-google-cloud-platform/target scio-smb/target scio-test/google-cloud-platform/target scio-neo4j/target scio-parquet/target scio-core/target scio-repl/target project/target

- name: Upload target directories
if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ Scio includes the following artifacts:
- `scio-redis`: add-on for Redis
- `scio-repl`: extension of the Scala REPL with Scio specific operations
- `scio-smb`: add-on for Sort Merge Bucket operations
- `scio-snowflake`: add-on for Snowflake IO
- `scio-tensorflow`: add-on for TensorFlow TFRecords IO and prediction
- `scio-test`: all following test utilities. Add to your project as a "test" dependency
- `scio-test-core`: test core utilities
Expand Down
17 changes: 17 additions & 0 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,7 @@ lazy val scio = project
`scio-redis`,
`scio-repl`,
`scio-smb`,
`scio-snowflake`,
`scio-tensorflow`,
`scio-test-core`,
`scio-test-google-cloud-platform`,
Expand Down Expand Up @@ -1265,6 +1266,22 @@ lazy val `scio-parquet` = project
)
)

lazy val `scio-snowflake` = project
.in(file("scio-snowflake"))
.dependsOn(
`scio-core` % "compile;test->test"
)
.settings(commonSettings)
.settings(
description := "Scio add-on for Neo4J",
libraryDependencies ++= Seq(
// compile
"org.apache.beam" % "beam-sdks-java-core" % beamVersion,
"org.apache.beam" % "beam-sdks-java-io-snowflake" % beamVersion,
"com.nrinaudo" %% "kantan.csv" % kantanCsvVersion,
)
)

val tensorFlowMetadataSourcesDir =
settingKey[File]("Directory containing TensorFlow metadata proto files")
val tensorFlowMetadata = taskKey[Seq[File]]("Retrieve TensorFlow metadata proto files")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*
* Copyright 2024 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package com.spotify.scio.snowflake

import com.spotify.scio.ScioContext
import com.spotify.scio.coders.{Coder, CoderMaterializer}
import com.spotify.scio.io.{EmptyTap, EmptyTapOf, ScioIO, Tap, TapT}
import com.spotify.scio.values.SCollection
import kantan.csv.{RowDecoder, RowEncoder}
import org.apache.beam.sdk.io.snowflake.SnowflakeIO.{CsvMapper, UserDataMapper}
import org.apache.beam.sdk.io.{snowflake => beam}

object SnowflakeIO {

private[snowflake] def dataSourceConfiguration(connectionOptions: SnowflakeConnectionOptions) = {

val datasourceInitial = beam.SnowflakeIO.DataSourceConfiguration
.create()

val datasourceWithAuthent = connectionOptions.authenticationOptions match {
case SnowflakeUsernamePasswordAuthenticationOptions(username, password) =>
datasourceInitial.withUsernamePasswordAuth(username, password)
case SnowflakeKeyPairAuthenticationOptions(username, privateKeyPath, None) =>
datasourceInitial.withKeyPairPathAuth(username, privateKeyPath)
case SnowflakeKeyPairAuthenticationOptions(username, privateKeyPath, Some(passphrase)) =>
datasourceInitial.withKeyPairPathAuth(username, privateKeyPath, passphrase)
case SnowflakeOAuthTokenAuthenticationOptions(token) =>
datasourceInitial.withOAuth(token)
}

val datasourceBeforeSchema = datasourceWithAuthent
.withServerName(connectionOptions.serverName)
.withDatabase(connectionOptions.database)
.withRole(connectionOptions.role)
.withWarehouse(connectionOptions.warehouse)

connectionOptions.schema
.map(schema => datasourceBeforeSchema.withSchema(schema))
.getOrElse(datasourceBeforeSchema)
}

private[snowflake] def buildCsvMapper[T](rowDecoder: RowDecoder[T]): CsvMapper[T] =
new CsvMapper[T] {
override def mapRow(parts: Array[String]): T = {
val unsnowedParts = parts.map {
case "\\N" => "" // needs to be mapped to an Option
case other => other
}.toSeq
rowDecoder.unsafeDecode(unsnowedParts)
}
}

private[snowflake] def prepareRead[T](
snowflakeOptions: SnowflakeOptions,
sc: ScioContext
)(implicit rowDecoder: RowDecoder[T], coder: Coder[T]): beam.SnowflakeIO.Read[T] =
beam.SnowflakeIO
.read()
.withDataSourceConfiguration(
SnowflakeIO.dataSourceConfiguration(snowflakeOptions.connectionOptions)
)
.withStagingBucketName(snowflakeOptions.stagingBucketName)
.withStorageIntegrationName(snowflakeOptions.storageIntegrationName)
.withCsvMapper(buildCsvMapper(rowDecoder))
.withCoder(CoderMaterializer.beam(sc, coder))
}

sealed trait SnowflakeIO[T] extends ScioIO[T]

final case class SnowflakeSelect[T](snowflakeOptions: SnowflakeOptions, select: String)(implicit
rowDecoder: RowDecoder[T],
coder: Coder[T]
) extends SnowflakeIO[T] {

override type ReadP = Unit
override type WriteP = Unit
override val tapT: TapT.Aux[T, Nothing] = EmptyTapOf[T]

override protected def read(sc: ScioContext, params: ReadP): SCollection[T] =
sc.applyTransform(SnowflakeIO.prepareRead(snowflakeOptions, sc).fromQuery(select))

override protected def write(data: SCollection[T], params: WriteP): Tap[Nothing] =
throw new UnsupportedOperationException("SnowflakeSelect is read-only")

override def tap(params: ReadP): Tap[Nothing] = EmptyTap
}

final case class SnowflakeTable[T](snowflakeOptions: SnowflakeOptions, table: String)(implicit
rowDecoder: RowDecoder[T],
rowEncoder: RowEncoder[T],
coder: Coder[T]
) extends SnowflakeIO[T] {

override type ReadP = Unit
override type WriteP = Unit
override val tapT: TapT.Aux[T, Nothing] = EmptyTapOf[T]

override protected def read(sc: ScioContext, params: ReadP): SCollection[T] =
sc.applyTransform(SnowflakeIO.prepareRead(snowflakeOptions, sc).fromTable(table))

override protected def write(data: SCollection[T], params: WriteP): Tap[Nothing] = {
data.applyInternal(
beam.SnowflakeIO
.write[T]()
.withDataSourceConfiguration(
SnowflakeIO.dataSourceConfiguration(snowflakeOptions.connectionOptions)
)
.to(table)
.withStagingBucketName(snowflakeOptions.stagingBucketName)
.withStorageIntegrationName(snowflakeOptions.storageIntegrationName)
.withUserDataMapper(new UserDataMapper[T] {
override def mapRow(element: T): Array[AnyRef] = rowEncoder.encode(element).toArray
})
)
EmptyTap
}

override def tap(params: ReadP): Tap[Nothing] = EmptyTap
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Copyright 2024 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package com.spotify.scio.snowflake

trait SnowflakeAuthenticationOptions

/**
* Options for a Snowflake username/password authentication.
*
* @param username
* username
* @param password
* password
*/
final case class SnowflakeUsernamePasswordAuthenticationOptions(
username: String,
password: String
) extends SnowflakeAuthenticationOptions

/**
* Options for a Snowflake key pair authentication.
*
* @param username
* username
* @param privateKeyPath
* path to the private key
* @param privateKeyPassphrase
* passphrase for the private key (optional)
*/
final case class SnowflakeKeyPairAuthenticationOptions(
username: String,
privateKeyPath: String,
privateKeyPassphrase: Option[String]
) extends SnowflakeAuthenticationOptions

/**
* Options for a Snowflake OAuth token authentication.
*
* @param token
* OAuth token
*/
final case class SnowflakeOAuthTokenAuthenticationOptions(
token: String
) extends SnowflakeAuthenticationOptions

/**
* Options for a Snowflake connection.
*
* @param authenticationOptions
* authentication options
* @param serverName
* server name (e.g. "account.region.snowflakecomputing.com")
* @param database
* database name
* @param role
* role name
* @param warehouse
* warehouse name
* @param schema
* schema name (optional)
*/
final case class SnowflakeConnectionOptions(
authenticationOptions: SnowflakeAuthenticationOptions,
serverName: String,
database: String,
role: String,
warehouse: String,
schema: Option[String]
)

/**
* Options for configuring a Neo4J driver.
*
* @param connectionOptions
* connection options
* @param stagingBucketName
* Snowflake staging bucket name where CSV files will be stored
* @param storageIntegrationName
* Storage integration name as created in Snowflake
*/
final case class SnowflakeOptions(
connectionOptions: SnowflakeConnectionOptions,
stagingBucketName: String,
storageIntegrationName: String
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
* Copyright 2024 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package com.spotify.scio

import com.spotify.scio.snowflake.syntax.AllSyntax

/**
* Main package for Snowflake APIs. Import all.
*
* {{{
* import com.spotify.scio.snowflake._
* }}}
*/
package object snowflake extends AllSyntax
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright 2024 Spotify AB.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package com.spotify.scio.snowflake.syntax

trait AllSyntax extends ScioContextSyntax with SCollectionSyntax
Loading

0 comments on commit b53021e

Please sign in to comment.