Skip to content

Commit

Permalink
WIP: start working on an api
Browse files Browse the repository at this point in the history
  • Loading branch information
akshaykarle committed May 24, 2024
1 parent aafbc8b commit fb6f5a7
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 2 deletions.
13 changes: 13 additions & 0 deletions analyzer_engine/csv_analyzer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,16 @@ def analyze_csv(
csv_dict = {header: list(map(str, values)) for header, *values in zip(*csv_list)}
analyzer_results = self.analyze_dict(csv_dict, language, keys_to_skip)
return list(analyzer_results)

def analyze_text(
    self,
    text: str,
    language: str,
    keys_to_skip: Optional[List[str]] = None,
    **kwargs,
) -> Iterable[DictAnalyzerResult]:
    """Analyze CSV content supplied as an in-memory string.

    The first non-empty row is treated as the header row. The remaining
    rows are transposed into a ``{header: [cell, ...]}`` mapping, which is
    handed to ``self.analyze_dict``.

    :param text: CSV content. Rows may be separated by real line breaks or
        by the literal two-character sequence backslash + ``n``.
    :param language: Language code forwarded to ``analyze_dict``.
    :param keys_to_skip: Optional column names forwarded to ``analyze_dict``.
    :param kwargs: Accepted for signature compatibility; not used here.
    :return: The results of ``analyze_dict`` materialized as a list.
    """
    import io

    # Earlier versions split on the literal "\\n" sequence only; keep
    # accepting it, and additionally honour real newlines.
    normalized = text.replace("\\n", "\n")

    # csv.reader (not DictReader) is required: it yields each row as a list
    # of cells, so zip(*rows) transposes rows into (header, *values)
    # columns. DictReader rows are dicts, and zipping them would iterate
    # their keys only, dropping every cell value.
    # newline="" lets the csv module handle line endings itself, including
    # line breaks embedded in quoted fields.
    reader = csv.reader(io.StringIO(normalized, newline=""))

    # Blank lines parse to empty lists; a single empty row would make
    # zip() yield no columns at all, so drop them.
    rows = [row for row in reader if row]

    # zip() stops at the shortest row, so cells beyond the shortest row's
    # width are ignored.
    csv_dict = {header: list(map(str, values)) for header, *values in zip(*rows)}
    analyzer_results = self.analyze_dict(csv_dict, language, keys_to_skip)
    return list(analyzer_results)
74 changes: 74 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import json
import logging
import os
from typing import Tuple

from flask import Flask, request, jsonify, Response

from analyzer_engine.csv_analyzer_engine import CSVAnalyzerEngine
from config.nlp_engine_config import FlairNLPEngine

DEFAULT_PORT = "3000"
NLP_ENGINE = "flair/ner-english-large"

class Server:
    """HTTP Server for calling Presidio Analyzer."""

    def __init__(self):
        """Create the Flask app, build the analyzer engine, register routes.

        The logger level is taken from the ``LOG_LEVEL`` environment
        variable when set. The engine is a ``CSVAnalyzerEngine`` backed by
        a ``FlairNLPEngine`` constructed from ``NLP_ENGINE``.
        """
        self.logger = logging.getLogger("pii-detection-anonymizer")
        self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
        self.app = Flask(__name__)
        self.logger.info("Starting analyzer engine")
        nlp_engine = FlairNLPEngine(NLP_ENGINE)
        self.engine = CSVAnalyzerEngine(nlp_engine)
        self.logger.info("Started analyzer engine")

        def _error_detail(exc: Exception) -> str:
            """Return a printable message for *exc*, even when it has no args."""
            # Indexing exc.args[0] directly raises IndexError for exceptions
            # created without arguments, which would crash the error handler.
            if exc.args:
                return str(exc.args[0])
            return str(exc) or type(exc).__name__

        @self.app.route("/health")
        def health() -> str:
            """Return basic health probe result."""
            return "PII detection and anonymizer service is up"

        @self.app.route("/analyze", methods=["POST"])
        def analyze() -> Tuple[str, int]:
            """Execute the analyzer function.

            Expects a JSON object with non-empty ``text`` and ``language``
            fields. On success returns the analyzer results serialized as
            JSON; on failure returns a JSON ``error`` payload with status
            400 (TypeError) or 500 (any other exception).
            """
            # Parse the request params
            try:
                # silent=True yields None for a missing or malformed JSON
                # body instead of raising an HTTP exception.
                req_json = request.get_json(silent=True)
                if not isinstance(req_json, dict):
                    raise Exception("Request body must be a JSON object")

                # NOTE(review): the validation failures below are client
                # errors; they keep returning 500 for compatibility with
                # existing callers. Consider switching to 400.
                text = req_json.get("text")
                if not text:
                    raise Exception("No text provided")

                language = req_json.get("language")
                if not language:
                    raise Exception("No language provided")

                recognizer_result_list = self.engine.analyze_text(
                    text=text,
                    language=language,
                )

                return Response(
                    json.dumps(
                        recognizer_result_list,
                        # Result objects are serialized through to_dict().
                        default=lambda o: o.to_dict(),
                        sort_keys=True,
                    ),
                    content_type="application/json",
                )
            except TypeError as te:
                error_msg = (
                    f"Failed to parse /analyze request "
                    f"for AnalyzerEngine.analyze(). {_error_detail(te)}"
                )
                self.logger.error(error_msg)
                return jsonify(error=error_msg), 400

            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnalyzerEngine.analyze(). {e}"
                )
                return jsonify(error=_error_detail(e)), 500

if __name__ == "__main__":
    # Resolve the listening port first (PORT env var, else DEFAULT_PORT),
    # then build the server and serve on all network interfaces.
    listen_port = int(os.getenv("PORT", DEFAULT_PORT))
    pii_server = Server()
    flask_app = pii_server.app
    flask_app.run(port=listen_port, host="0.0.0.0")
63 changes: 62 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ spacy = "^3.7.4"
scipy = "<1.13.0"
presidio-anonymizer = "^2.2.354"
presidio-analyzer = {version = "^2.2.354", extras = ["transformers", "stanza"]}
flask = "^3.0.3"


[build-system]
Expand Down
14 changes: 13 additions & 1 deletion tests/analyzer_engine/csv_analyzer_engine_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,16 @@ def test_csv_analyzer_engine_anonymizer(self):
anonymizer = BatchAnonymizerEngine()
anonymized_results = anonymizer.anonymize_dict(analyzer_results)
pprint.pprint(anonymized_results)
self.assertIsNotNone(anonymized_results)
self.assertisnotnone(anonymized_results)

def test_csv_analyzer_engine_text_data(self):
    """Analyze CSV passed as a raw string, then anonymize the results."""
    import pprint
    from presidio_anonymizer import BatchAnonymizerEngine

    analyzer_results = self.csv_analyser.analyze_text(
        'id,name,city,comments\n1,John Smith,LA,drivers license is AC432223',
        language="en",
    )

    pprint.pprint(analyzer_results)

    anonymizer = BatchAnonymizerEngine()
    anonymized_results = anonymizer.anonymize_dict(analyzer_results)
    pprint.pprint(anonymized_results)
    # unittest assertion names are case-sensitive: assertIsNotNone.
    self.assertIsNotNone(anonymized_results)

0 comments on commit fb6f5a7

Please sign in to comment.