diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..28c5dcd --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,51 @@ +name: Create and publish a Docker image + +on: + push: + branches: + - main + tags: + - '*' + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: sahajsoft/pii + +jobs: + build-and-push-image: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + attestations: write + id-token: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + - name: Build and push Docker image + id: push + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + with: + context: . 
+ push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + - name: Generate artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} + subject-digest: ${{ steps.push.outputs.digest }} + push-to-registry: true + diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index d74f636..0d01929 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,6 +1,12 @@ name: "Test and Build" -on: [push] +on: + push: + branches: + - main + pull_request: + branches: + - main jobs: build: diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cfa035f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.11 + +WORKDIR /usr/src/app +RUN pip install poetry==1.8.3 +ENV POETRY_NO_INTERACTION=1 +ENV POETRY_VIRTUALENVS_IN_PROJECT=1 +ENV POETRY_VIRTUALENVS_CREATE=1 +ENV POETRY_CACHE_DIR=/tmp/poetry_cache +COPY poetry.toml . +COPY poetry.lock . +COPY pyproject.toml . +RUN poetry install --no-root && rm -rf $POETRY_CACHE_DIR +RUN poetry run python -m spacy download en_core_web_sm +COPY src src +COPY tests tests + +ENTRYPOINT [ "poetry", "run", "python", "src/cli.py" ] diff --git a/README.md b/README.md index 88bd227..6f37cba 100644 --- a/README.md +++ b/README.md @@ -2,34 +2,33 @@ This is a python app to detect and anonymise PII data using Named Entity Recognition with Flair-based Embeddings built on top of [Presidio](https://github.com/microsoft/presidio). -## Prerequisites - -Run `./setup.sh` to install all dependencies. This will install [direnv](https://github.com/direnv/direnv/blob/master/docs/installation.md) and [nix](https://nixos.org/download.html) then simply run `direnv allow` to install all build dependencies. - -Alternatively, make sure you have [python 3.11](https://www.python.org/downloads/) and [poetry](https://python-poetry.org/docs/#installation) setup on your machine. 
- ## Getting Started -To get started, run the following: +You can use docker to install the app (follow instructions +[here](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry#authenticating-to-the-container-registry) +to authenticate): -``` -poetry install --no-interaction --no-root -poetry run pytest +```sh +docker pull ghcr.io/sahajsoft/pii ``` -### Demo +To use, it's best to add an alias: -You can run this as an api or using the cli. You can find the [API demo here](https://www.loom.com/share/ad8b37451ea54dcda8716cb5c6f11e94). - -You can run the API using the following command: -``` -poetry run python src/app.py +```sh +alias pii='docker run --rm -i ghcr.io/sahajsoft/pii' ``` -To run the cli locally, run any of the following commands: +Then you can use `pii` as shown in the sample commands below. + +## Usage + +To run the CLI locally, run any of the following commands: ```sh +# alias for easier calls alias pii='poetry run python src/cli.py' +# or, alias for docker +alias pii='docker run --rm -i ghcr.io/sahajsoft/pii' # text echo "My name is Don Stark and my phone number is 212-555-5555" | pii analyze @@ -62,7 +61,24 @@ pii --help * Detecting if PII is present in any of your files, text or structured data like json, etc. * Anonymizing/Deanonymizing PII data before sending to services like OpenAI, Anthropic, etc. for training or inference. -## Troubleshooting +## Development + +### Prerequisites + +Run `./setup.sh` to install all dependencies. This will install [direnv](https://github.com/direnv/direnv/blob/master/docs/installation.md) and [nix](https://nixos.org/download.html) then simply run `direnv allow` to install all build dependencies. + +Alternatively, make sure you have [python 3.11](https://www.python.org/downloads/) and [poetry](https://python-poetry.org/docs/#installation) set up on your machine. 
+ +### Running the app + +To get started, run the following: + +``` +poetry install --no-interaction --no-root +poetry run pytest +``` + +### Troubleshooting There is a chance that `direnv allow` will not load the environment correctly and silently fail. This is observable when you attempt to run `poetry install`, as you will get a `command not found` error in the shell. To fix this, you need to run the nix commands directly. Run the following: diff --git a/pyproject.toml b/pyproject.toml index f6a6c62..2d1816b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pii-detection-and-anonymizer" -version = "0.1.0" +version = "0.1.1" description = "" authors = ["Akshay Karle <1443108+akshaykarle@users.noreply.github.com>"] license = "MIT"