Merge pull request #45 from dayyass/develop
release v0.1.1
dayyass authored Aug 11, 2021
2 parents b083cf1 + 4646ecd commit 3b90e19
Showing 20 changed files with 422 additions and 124 deletions.
17 changes: 17 additions & 0 deletions .coveragerc
@@ -0,0 +1,17 @@
[run]
branch = True
source = text_clf

[report]
exclude_lines =
pragma: no cover
if self\.debug
raise AssertionError
raise NotImplementedError
if __name__ == .__main__.:

omit =
text_clf/__main__.py

show_missing = True
ignore_errors = False
38 changes: 38 additions & 0 deletions .github/workflows/codecov.yml
@@ -0,0 +1,38 @@
# This workflow will install Python dependencies and run codecov
# https://github.com/codecov/codecov-action#example-workflowyml-with-codecov-action

name: codecov

on:
push:
branches: [main, develop]
pull_request:
branches: [main, develop]

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
steps:
- uses: actions/checkout@master
- name: Set up Python
uses: actions/setup-python@master
with:
python-version: 3.7
- name: Install dependencies
run: |
pip install --upgrade pip
pip install -r requirements.txt
pip install pytest pytest-cov
- name: Generate coverage report
run: |
pytest --cov=./ --cov-report=xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
flags: unittests
env_vars: OS,PYTHON
fail_ci_if_error: true
verbose: true
41 changes: 41 additions & 0 deletions .github/workflows/linter.yml
@@ -0,0 +1,41 @@
# This workflow will install Python dependencies and run linter
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: linter

on:
push:
branches: [main, develop]
pull_request:
branches: [main, develop]

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Install dependencies
run: |
pip install --upgrade pip
pip install isort black flake8 types-PyYAML mypy
- name: Code format check with isort
run: |
isort --check-only --profile black .
- name: Code format check with black
run: |
black --check .
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Type check with mypy
run: mypy --ignore-missing-imports .
31 changes: 31 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,31 @@
# This workflow will install Python dependencies and run tests with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: tests

on:
push:
branches: [main, develop]
pull_request:
branches: [main, develop]

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ['3.6', '3.7', '3.8', '3.9']
os: [ubuntu-latest, macOS-latest, windows-latest]
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
pip install --upgrade pip
pip install -r requirements.txt
- name: Tests
run: |
python -m unittest discover
5 changes: 2 additions & 3 deletions .gitignore
@@ -18,9 +18,8 @@ dist

*.egg-info/

data/*
!data/README.md
!data/fetch_20newsgroups.py
data/train.csv
data/valid.csv

models/*
!models/README.md
12 changes: 12 additions & 0 deletions Dockerfile
@@ -0,0 +1,12 @@
FROM python:3.7-slim-buster
MAINTAINER Dani El-Ayyass <[email protected]>

WORKDIR /workdir

COPY config.yaml ./
COPY data/train.csv data/valid.csv data/

RUN pip install --upgrade pip && \
pip install --no-cache-dir text-classification-baseline

CMD ["bash"]
52 changes: 38 additions & 14 deletions README.md
@@ -1,5 +1,19 @@
[![tests](https://github.com/dayyass/text-classification-baseline/actions/workflows/tests.yml/badge.svg)](https://github.com/dayyass/text-classification-baseline/actions/workflows/tests.yml)
[![linter](https://github.com/dayyass/text-classification-baseline/actions/workflows/linter.yml/badge.svg)](https://github.com/dayyass/text-classification-baseline/actions/workflows/linter.yml)
[![codecov](https://codecov.io/gh/dayyass/text-classification-baseline/branch/main/graph/badge.svg?token=ABFF3YQBJV)](https://codecov.io/gh/dayyass/text-classification-baseline)

[![python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://github.com/dayyass/text-classification-baseline#requirements)
[![release (latest by date)](https://img.shields.io/github/v/release/dayyass/text-classification-baseline)](https://github.com/dayyass/text-classification-baseline/releases/latest)
[![license](https://img.shields.io/github/license/dayyass/text-classification-baseline?color=blue)](https://github.com/dayyass/text-classification-baseline/blob/main/LICENSE)

[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-black)](https://github.com/dayyass/text-classification-baseline/blob/main/.pre-commit-config.yaml)
[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

[![pypi version](https://img.shields.io/pypi/v/text-classification-baseline)](https://pypi.org/project/text-classification-baseline)
[![pypi downloads](https://img.shields.io/pypi/dm/text-classification-baseline)](https://pypi.org/project/text-classification-baseline)

### Text Classification Baseline
Pipeline for building text classification **TF-IDF + LogReg** baselines using **sklearn**.
Pipeline for fast building text classification **TF-IDF + LogReg** baselines.

### Usage
Instead of writing custom code for specific text classification task, you just need:
@@ -8,18 +8,22 @@ Instead of writing custom code for specific text classification task, you just need:
pip install text-classification-baseline
```
2. run pipeline:
- either in **terminal**:
```shell script
text-clf-train
```
- or in **python**:
```python3
import text_clf

- either in **terminal**:
```shell script
text-clf --config config.yaml
```

- or in **python**:
```python3
import text_clf
text_clf.train(path_to_config="config.yaml")
```
text_clf.train()
```

No data preparation is needed, only a **csv** file with two raw columns (with arbitrary names):
- `text`
@@ -30,7 +42,17 @@ No data preparation is needed, only a **csv** file with two raw columns (with arbitrary names):
#### Config
The user interface consists of only one file [**config.yaml**](https://github.com/dayyass/text-classification-baseline/blob/main/config.yaml).

Change **config.yaml** to create the desired configuration and train text classification model.
Change **config.yaml** to create the desired configuration and train text classification model with the following command:
- **terminal**:
```shell script
text-clf-train --path_to_config config.yaml
```
- **python**:
```python3
import text_clf

text_clf.train(path_to_config="config.yaml")
```

Default **config.yaml**:
```yaml
@@ -63,6 +85,8 @@ logreg:
n_jobs: -1
```
**NOTE**: `tf-idf` and `logreg` are sklearn [**TfidfVectorizer**](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html?highlight=tfidf#sklearn.feature_extraction.text.TfidfVectorizer) and [**LogisticRegression**](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) parameters correspondingly, so you can parameterize instances of these classes however you want.
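
For illustration, a minimal sketch of how a config structured this way could be unpacked into the corresponding sklearn objects. This is not the package's internal code — the top-level `tf-idf`/`logreg` nesting and the pipeline step names are assumptions based on the NOTE above:
```python3
# Sketch only: build a TF-IDF + LogReg pipeline from config.yaml sections.
# Assumes `tf-idf` and `logreg` are top-level mappings in the config file.
import yaml
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

with open("config.yaml") as f:
    config = yaml.safe_load(f)

pipeline = Pipeline(
    steps=[
        ("tf-idf", TfidfVectorizer(**config.get("tf-idf", {}))),
        ("logreg", LogisticRegression(**config.get("logreg", {}))),
    ]
)
```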

#### Output
After training the model, the pipeline will return the following files:
- `model.joblib` - sklearn pipeline with TF-IDF and LogReg steps
@@ -71,7 +95,7 @@ After training the model, the pipeline will return the following files:
- `logging.txt` - logging file
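
As a hedged usage example (not part of the original README), the saved `model.joblib` can be reloaded with joblib and applied to raw text, since the serialized pipeline already contains the TF-IDF step; the output location is assumed to be the current directory:
```python3
# Sketch: reload the trained pipeline and classify new raw text.
import joblib

model = joblib.load("model.joblib")  # sklearn Pipeline: TF-IDF -> LogReg
predictions = model.predict(["some raw text to classify"])
print(predictions)
```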

### Requirements
Python >= 3.7
Python >= 3.6

### Citation
If you use **text-classification-baseline** in a scientific publication, we would appreciate references to the following BibTex entry:
15 changes: 15 additions & 0 deletions codecov.yml
@@ -0,0 +1,15 @@
codecov:
require_ci_to_pass: yes

ignore:
- "text_clf/__main__.py"

coverage:
status:
project:
default: false
source:
paths:
- "text_clf/"
target: 90%
patch: off
Empty file added data/__init__.py
Empty file.
45 changes: 0 additions & 45 deletions data/fetch_20newsgroups.py

This file was deleted.

47 changes: 47 additions & 0 deletions data/load_20newsgroups.py
@@ -0,0 +1,47 @@
import os

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.utils import Bunch


def make_df_from_bunch(bunch: Bunch) -> pd.DataFrame:
"""
Make pd.DataFrame from 20newsgroups bunch.
:param Bunch bunch: 20newsgroups bunch.
:return: 20newsgroups DataFrame.
:rtype: pd.DataFrame
"""

df = pd.DataFrame(
{
"text": bunch.data,
"target": bunch.target,
}
)
df["target_name"] = df["target"].map(lambda x: bunch.target_names[x])
df["target_name_short"] = df["target_name"].map(lambda x: x.split(".")[0])

return df


def load_20newsgroups() -> None:
"""
Load 20newsgroups dataset.
"""

train_bunch = fetch_20newsgroups(subset="train")
test_bunch = fetch_20newsgroups(subset="test")

df_train = make_df_from_bunch(train_bunch)
df_valid = make_df_from_bunch(test_bunch)

os.makedirs("data", exist_ok=True)

df_train.to_csv("data/train.csv", index=False)
df_valid.to_csv("data/valid.csv", index=False)


if __name__ == "__main__":
load_20newsgroups()
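
After running this script, a quick sanity check (a sketch, not part of the repository) confirms that the generated CSVs contain the raw text column plus the label columns built in `make_df_from_bunch`:
```python3
# Sketch: verify the structure of the CSVs written by load_20newsgroups().
import pandas as pd

df_train = pd.read_csv("data/train.csv")
print(df_train.shape)
print(df_train[["text", "target", "target_name", "target_name_short"]].head())
```
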
6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,3 +1,3 @@
pandas==1.3.1
PyYAML==5.4.1
scikit-learn==0.24.2
pandas>=1.1.5
PyYAML>=5.4.1
scikit-learn>=0.24.2
8 changes: 4 additions & 4 deletions setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = text-classification-baseline
version = 0.1.0
version = 0.1.1
author = Dani El-Ayyass
author_email = [email protected]
description = TF-IDF + LogReg baseline for text classification
@@ -16,12 +16,12 @@ classifiers =

[options]
packages = find:
python_requires = >=3.7
python_requires = >=3.6
install_requires =
pandas >= 1.3.1
pandas >= 1.1.5
PyYAML >= 5.4.1
scikit-learn >= 0.24.2

[options.entry_points]
console_scripts =
text-clf = text_clf.__main__:main
text-clf-train = text_clf.__main__:main
Empty file added tests/__init__.py
Empty file.
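
Since the tests workflow above runs `python -m unittest discover`, any `test_*.py` module placed next to this `tests/__init__.py` will be collected. A hypothetical example (not the repository's actual test suite) might be:
```python3
# tests/test_api.py — illustrative only; file name and contents are assumptions.
import unittest

import text_clf


class TestPublicApi(unittest.TestCase):
    def test_train_is_callable(self):
        # The README shows `text_clf.train(path_to_config="config.yaml")`,
        # so the function should at least be exposed and callable.
        self.assertTrue(callable(text_clf.train))


if __name__ == "__main__":
    unittest.main()
```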