diff --git a/dragnet/compat.py b/dragnet/compat.py index b6f67f41..2a99f152 100644 --- a/dragnet/compat.py +++ b/dragnet/compat.py @@ -271,16 +271,28 @@ def bytes_block_list_cast(blocks, **kwargs): from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV +# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23; fall back to the standalone joblib package only when that import is unavailable. +try: + from sklearn.externals import joblib +except ImportError: + import joblib + + # generate model paths if '0.15.2' <= sklearn_version <= '0.17.1': if PY2: model_path = 'py2_sklearn_0.15.2_0.17.1' else: model_path = 'py3_sklearn_0.15.2_0.17.1' -elif sklearn_version >= '0.18.0': +elif sklearn_version >= '0.18.0' and sklearn_version < '1.0.0': if PY2: model_path = 'py2_sklearn_0.18.0' else: model_path = 'py3_sklearn_0.18.0' +elif sklearn_version >= '1.0.0': + if PY2: + raise Exception('incompatible scikit-learn version: "{}" with Python 2.'.format(sklearn_version)) + else: + model_path = 'py3_sklearn_1.1.2' else: raise Exception('incompatible scikit-learn version: "{}"'.format(sklearn_version)) diff --git a/dragnet/model_training.py b/dragnet/model_training.py index ee8d8889..fdc7c705 100644 --- a/dragnet/model_training.py +++ b/dragnet/model_training.py @@ -6,12 +6,11 @@ import pprint import numpy as np -from sklearn.externals import joblib from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score from sklearn.pipeline import FeatureUnion from .blocks import simple_tokenizer -from .compat import GridSearchCV, model_path, string_, train_test_split, str_cast +from .compat import GridSearchCV, model_path, string_, train_test_split, str_cast, joblib from .data_processing import prepare_all_data from .util import dameraulevenshtein diff --git a/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_block_errors.txt b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_block_errors.txt new file mode 100644 index 00000000..4b83d2ed --- /dev/null +++ 
b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.8867353503689139, + 'f1': 0.8091648441058558, + 'precision': 0.8137229534106326, + 'recall': 0.8046575153003008} +Test errors for final model (block level): +{'accuracy': 0.9110456517753468, + 'f1': 0.8077473755395531, + 'precision': 0.803548716008115, + 'recall': 0.8119901427600272} \ No newline at end of file diff --git a/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_content_block_errors.txt b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_content_block_errors.txt new file mode 100644 index 00000000..4263eb36 --- /dev/null +++ b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_content_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.9204016804789045, + 'f1': 0.9404366736699382, + 'precision': 0.9170907130454687, + 'recall': 0.9650022976848844} +Test errors for final model (block level): +{'accuracy': 0.8950754751573672, + 'f1': 0.915453950945091, + 'precision': 0.8719096486126484, + 'recall': 0.9635762001318621} \ No newline at end of file diff --git a/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_content_model.pkl.gz b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_content_model.pkl.gz new file mode 100644 index 00000000..f9969713 Binary files /dev/null and b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_content_model.pkl.gz differ diff --git a/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_model.pkl.gz b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_model.pkl.gz new file mode 100644 index 00000000..da5c1fe2 Binary files /dev/null 
and b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_comments_model.pkl.gz differ diff --git a/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_content_block_errors.txt b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_content_block_errors.txt new file mode 100644 index 00000000..25f5885e --- /dev/null +++ b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_content_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.9052602415469156, + 'f1': 0.86484139122908, + 'precision': 0.8726660744895444, + 'recall': 0.8571557796569614} +Test errors for final model (block level): +{'accuracy': 0.9226621035262482, + 'f1': 0.8950188977266318, + 'precision': 0.8746076422215882, + 'recall': 0.9164056182600246} \ No newline at end of file diff --git a/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_content_model.pkl.gz b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_content_model.pkl.gz new file mode 100644 index 00000000..a9ca2ab7 Binary files /dev/null and b/dragnet/pickled_models/py3_sklearn_1.1.2/kohlschuetter_readability_weninger_content_model.pkl.gz differ diff --git a/dragnet/util.py b/dragnet/util.py index 8d83eb48..57e04f4f 100644 --- a/dragnet/util.py +++ b/dragnet/util.py @@ -10,10 +10,9 @@ import os import pkgutil -from sklearn.externals import joblib from sklearn.pipeline import FeatureUnion, make_union -from .compat import model_path, range_, string_, PY2 +from .compat import model_path, range_, string_, PY2, joblib from .features import get_feature diff --git a/requirements.txt b/requirements.txt index b944b99f..1a1b88d9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,5 @@ lxml>=4.2.3 numpy>=1.11.0 pytest>=4.0.0 pytest-cov>=2.6.0 -scikit-learn>=0.15.2,<0.21.0 +scikit-learn>=0.15.2 scipy>=0.17.0 diff --git a/scripts/train_and_test_models.py 
b/scripts/train_and_test_models.py index 84c64100..85e7edb4 100644 --- a/scripts/train_and_test_models.py +++ b/scripts/train_and_test_models.py @@ -4,7 +4,8 @@ from sklearn.ensemble import ExtraTreesClassifier -from dragnet.model_training import evaluate_models_tokens, train_models +from dragnet.model_training import train_model +from dragnet.extractor import Extractor MODEL = ExtraTreesClassifier( @@ -29,7 +30,7 @@ def main(): help='directory to which models, training errors, etc. will be saved') parser.add_argument( '--content_or_comments', type=str, required=True, - choices=['content', 'both'], + choices=['content', 'comments', 'both'], help="""type of information to be extracted by the model: just "content", or "both" content and comments""") parser.add_argument( @@ -40,18 +41,15 @@ def main(): be one of the features known by `dragnet.AllFeatures`""") args = vars(parser.parse_args()) - # train the model - dragnet_model = train_models( - args['data_dir'], args['output_dir'], args['features'], MODEL, - content_or_comments=args['content_or_comments']) - - # and evaluate it - figname_prefix = '_'.join(args['features']) + \ - '_content_' if args['content_or_comments'] == 'content' else '_content_comments_' - evaluate_models_tokens( - args['data_dir'], dragnet_model, - content_or_comments=args['content_or_comments'], - figname_root=os.path.join(args['output_dir'], figname_prefix)) + # train and evaluate model + if args['content_or_comments'] == 'content': + to_extract = 'content' + elif args['content_or_comments'] == 'comments': + to_extract = 'comments' + elif args['content_or_comments'] == 'both': + to_extract = ['content', 'comments'] + extractor = Extractor(features=args['features'], model=MODEL, to_extract=to_extract) + trained_extractor = train_model(extractor, args['data_dir'], args['output_dir']) if __name__ == '__main__': diff --git a/setup.py b/setup.py index 71ed060d..e7b2b2b1 100644 --- a/setup.py +++ b/setup.py @@ -108,7 +108,7 @@ def 
find_libxml2_include(): 'ftfy>=4.1.0,<5.0.0', 'lxml', 'numpy>=1.11.0', - 'scikit-learn>=0.15.2,<0.21.0', + 'scikit-learn>=0.15.2', 'scipy>=0.17.0', ] )