Merge pull request #45 from dayyass/develop
release v0.1.1
dayyass authored Aug 11, 2021
2 parents b083cf1 + 4646ecd commit 3b90e19
Showing 20 changed files with 422 additions and 124 deletions.
17 changes: 17 additions & 0 deletions .coveragerc
@@ -0,0 +1,17 @@
[run]
branch = True
source = text_clf

[report]
exclude_lines =
pragma: no cover
if self\.debug
raise AssertionError
raise NotImplementedError
if __name__ == .__main__.:

omit =
text_clf/__main__.py

show_missing = True
ignore_errors = False
38 changes: 38 additions & 0 deletions .github/workflows/codecov.yml
@@ -0,0 +1,38 @@
# This workflow will install Python dependencies and run codecov
# https://github.com/codecov/codecov-action#example-workflowyml-with-codecov-action

name: codecov

on:
push:
branches: [main, develop]
pull_request:
branches: [main, develop]

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
steps:
- uses: actions/checkout@master
- name: Set up Python
uses: actions/setup-python@master
with:
python-version: 3.7
- name: Install dependencies
run: |
pip install --upgrade pip
pip install -r requirements.txt
pip install pytest pytest-cov
- name: Generate coverage report
run: |
pytest --cov=./ --cov-report=xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
flags: unittests
env_vars: OS,PYTHON
fail_ci_if_error: true
verbose: true
41 changes: 41 additions & 0 deletions .github/workflows/linter.yml
@@ -0,0 +1,41 @@
# This workflow will install Python dependencies and run linter
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: linter

on:
push:
branches: [main, develop]
pull_request:
branches: [main, develop]

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest]
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Install dependencies
run: |
pip install --upgrade pip
pip install isort black flake8 types-PyYAML mypy
- name: Code format check with isort
run: |
isort --check-only --profile black .
- name: Code format check with black
run: |
black --check .
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Type check with mypy
run: mypy --ignore-missing-imports .
31 changes: 31 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,31 @@
# This workflow will install Python dependencies and run tests with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: tests

on:
push:
branches: [main, develop]
pull_request:
branches: [main, develop]

jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ['3.6', '3.7', '3.8', '3.9']
os: [ubuntu-latest, macOS-latest, windows-latest]
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
pip install --upgrade pip
pip install -r requirements.txt
- name: Tests
run: |
python -m unittest discover
5 changes: 2 additions & 3 deletions .gitignore
@@ -18,9 +18,8 @@ dist

*.egg-info/

data/*
!data/README.md
!data/fetch_20newsgroups.py
data/train.csv
data/valid.csv

models/*
!models/README.md
12 changes: 12 additions & 0 deletions Dockerfile
@@ -0,0 +1,12 @@
FROM python:3.7-slim-buster
MAINTAINER Dani El-Ayyass <[email protected]>

WORKDIR /workdir

COPY config.yaml ./
COPY data/train.csv data/valid.csv data/

RUN pip install --upgrade pip && \
pip install --no-cache-dir text-classification-baseline

CMD ["bash"]
52 changes: 38 additions & 14 deletions README.md
@@ -1,5 +1,19 @@
[![tests](https://github.com/dayyass/text-classification-baseline/actions/workflows/tests.yml/badge.svg)](https://github.com/dayyass/text-classification-baseline/actions/workflows/tests.yml)
[![linter](https://github.com/dayyass/text-classification-baseline/actions/workflows/linter.yml/badge.svg)](https://github.com/dayyass/text-classification-baseline/actions/workflows/linter.yml)
[![codecov](https://codecov.io/gh/dayyass/text-classification-baseline/branch/main/graph/badge.svg?token=ABFF3YQBJV)](https://codecov.io/gh/dayyass/text-classification-baseline)

[![python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://github.com/dayyass/text-classification-baseline#requirements)
[![release (latest by date)](https://img.shields.io/github/v/release/dayyass/text-classification-baseline)](https://github.com/dayyass/text-classification-baseline/releases/latest)
[![license](https://img.shields.io/github/license/dayyass/text-classification-baseline?color=blue)](https://github.com/dayyass/text-classification-baseline/blob/main/LICENSE)

[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-black)](https://github.com/dayyass/text-classification-baseline/blob/main/.pre-commit-config.yaml)
[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

[![pypi version](https://img.shields.io/pypi/v/text-classification-baseline)](https://pypi.org/project/text-classification-baseline)
[![pypi downloads](https://img.shields.io/pypi/dm/text-classification-baseline)](https://pypi.org/project/text-classification-baseline)

### Text Classification Baseline
Pipeline for building text classification **TF-IDF + LogReg** baselines using **sklearn**.
Pipeline for fast building text classification **TF-IDF + LogReg** baselines.

### Usage
Instead of writing custom code for specific text classification task, you just need:
@@ -8,18 +8,22 @@ Instead of writing custom code for specific text classification task, you just need:
pip install text-classification-baseline
```
2. run pipeline:
- either in **terminal**:
```shell script
text-clf-train
```
- or in **python**:
```python3
import text_clf

- either in **terminal**:
```shell script
text-clf --config config.yaml
```

- or in **python**:
```python3
import text_clf
text_clf.train(path_to_config="config.yaml")
```
text_clf.train()
```

No data preparation is needed, only a **csv** file with two raw columns (with arbitrary names):
- `text`
@@ -30,7 +42,17 @@ No data preparation is needed, only a **csv** file with two raw columns (with arbitrary names):
#### Config
The user interface consists of only one file [**config.yaml**](https://github.com/dayyass/text-classification-baseline/blob/main/config.yaml).

Change **config.yaml** to create the desired configuration and train text classification model.
Change **config.yaml** to create the desired configuration and train text classification model with the following command:
- **terminal**:
```shell script
text-clf-train --path_to_config config.yaml
```
- **python**:
```python3
import text_clf

text_clf.train(path_to_config="config.yaml")
```

Default **config.yaml**:
```yaml
@@ -63,6 +85,8 @@ logreg:
n_jobs: -1
```
**NOTE**: `tf-idf` and `logreg` are sklearn [**TfidfVectorizer**](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html?highlight=tfidf#sklearn.feature_extraction.text.TfidfVectorizer) and [**LogisticRegression**](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) parameters correspondingly, so you can parameterize instances of these classes however you want.
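
For illustration, a minimal sketch of how a config structured this way could be unpacked into the corresponding sklearn objects. This is not the package's internal code — the top-level `tf-idf`/`logreg` nesting and the pipeline step names are assumptions based on the NOTE above:
```python3
# Sketch only: build a TF-IDF + LogReg pipeline from config.yaml sections.
# Assumes `tf-idf` and `logreg` are top-level mappings in the config file.
import yaml
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

with open("config.yaml") as f:
    config = yaml.safe_load(f)

pipeline = Pipeline(
    steps=[
        ("tf-idf", TfidfVectorizer(**config.get("tf-idf", {}))),
        ("logreg", LogisticRegression(**config.get("logreg", {}))),
    ]
)
```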

#### Output
After training the model, the pipeline will return the following files:
- `model.joblib` - sklearn pipeline with TF-IDF and LogReg steps
@@ -71,7 +95,7 @@ After training the model, the pipeline will return the following files:
- `logging.txt` - logging file
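
As a hedged usage example (not part of the original README), the saved `model.joblib` can be reloaded with joblib and applied to raw text, since the serialized pipeline already contains the TF-IDF step; the output location is assumed to be the current directory:
```python3
# Sketch: reload the trained pipeline and classify new raw text.
import joblib

model = joblib.load("model.joblib")  # sklearn Pipeline: TF-IDF -> LogReg
predictions = model.predict(["some raw text to classify"])
print(predictions)
```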

### Requirements
Python >= 3.7
Python >= 3.6

### Citation
If you use **text-classification-baseline** in a scientific publication, we would appreciate references to the following BibTex entry:
15 changes: 15 additions & 0 deletions codecov.yml
@@ -0,0 +1,15 @@
codecov:
require_ci_to_pass: yes

ignore:
- "text_clf/__main__.py"

coverage:
status:
project:
default: false
source:
paths:
- "text_clf/"
target: 90%
patch: off
Empty file added data/__init__.py
Empty file.
45 changes: 0 additions & 45 deletions data/fetch_20newsgroups.py

This file was deleted.

47 changes: 47 additions & 0 deletions data/load_20newsgroups.py
@@ -0,0 +1,47 @@
import os

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.utils import Bunch


def make_df_from_bunch(bunch: Bunch) -> pd.DataFrame:
"""
Make pd.DataFrame from 20newsgroups bunch.
:param Bunch bunch: 20newsgroups bunch.
:return: 20newsgroups DataFrame.
:rtype: pd.DataFrame
"""

df = pd.DataFrame(
{
"text": bunch.data,
"target": bunch.target,
}
)
df["target_name"] = df["target"].map(lambda x: bunch.target_names[x])
df["target_name_short"] = df["target_name"].map(lambda x: x.split(".")[0])

return df


def load_20newsgroups() -> None:
"""
Load 20newsgroups dataset.
"""

train_bunch = fetch_20newsgroups(subset="train")
test_bunch = fetch_20newsgroups(subset="test")

df_train = make_df_from_bunch(train_bunch)
df_valid = make_df_from_bunch(test_bunch)

os.makedirs("data", exist_ok=True)

df_train.to_csv("data/train.csv", index=False)
df_valid.to_csv("data/valid.csv", index=False)


if __name__ == "__main__":
load_20newsgroups()
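
After running this script, a quick sanity check (a sketch, not part of the repository) confirms that the generated CSVs contain the raw text column plus the label columns built in `make_df_from_bunch`:
```python3
# Sketch: verify the structure of the CSVs written by load_20newsgroups().
import pandas as pd

df_train = pd.read_csv("data/train.csv")
print(df_train.shape)
print(df_train[["text", "target", "target_name", "target_name_short"]].head())
```
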
6 changes: 3 additions & 3 deletions requirements.txt
@@ -1,3 +1,3 @@
pandas==1.3.1
PyYAML==5.4.1
scikit-learn==0.24.2
pandas>=1.1.5
PyYAML>=5.4.1
scikit-learn>=0.24.2
8 changes: 4 additions & 4 deletions setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = text-classification-baseline
version = 0.1.0
version = 0.1.1
author = Dani El-Ayyass
author_email = [email protected]
description = TF-IDF + LogReg baseline for text classification
@@ -16,12 +16,12 @@ classifiers =

[options]
packages = find:
python_requires = >=3.7
python_requires = >=3.6
install_requires =
pandas >= 1.3.1
pandas >= 1.1.5
PyYAML >= 5.4.1
scikit-learn >= 0.24.2

[options.entry_points]
console_scripts =
text-clf = text_clf.__main__:main
text-clf-train = text_clf.__main__:main
Empty file added tests/__init__.py
Empty file.
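
Since the tests workflow above runs `python -m unittest discover`, any `test_*.py` module placed next to this `tests/__init__.py` will be collected. A hypothetical example (not the repository's actual test suite) might be:
```python3
# tests/test_api.py — illustrative only; file name and contents are assumptions.
import unittest

import text_clf


class TestPublicApi(unittest.TestCase):
    def test_train_is_callable(self):
        # The README shows `text_clf.train(path_to_config="config.yaml")`,
        # so the function should at least be exposed and callable.
        self.assertTrue(callable(text_clf.train))


if __name__ == "__main__":
    unittest.main()
```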