ml_models.train_mods module

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
    ml_models.train_gender_identification_model
    ~~~~~~~~~~~~~
    XXXXXX
    :copyright: © 2019 Niels Goet @ PCC Project
"""

import datetime
import os
import time
from os import listdir
from os.path import isfile, join
from typing import List

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib  # plain 'import joblib' in scikit-learn >= 0.23
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle

from data_ingestion import constants as c
from data_ingestion import data_utils


class TrainMods:
    """
    Train Models
    """
    def __init__(self, mod: str = 'sgd'):
        supported_mods = ['sgd', 'gbc', 'nb']
        assert mod in supported_mods, ('Model {} not supported, please choose '
                                       'one of the following options: {}'
                                       ).format(mod, supported_mods)

        if mod == 'sgd':
            param_dist = {
                'alpha': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
                'max_iter': [1000],  # number of epochs
                'penalty': ['l2'],
                'n_jobs': [-1]
            }

            # Note: iid was removed in scikit-learn 0.24 and loss='log' was
            # renamed to 'log_loss' in 1.1; this code targets 2019-era releases.
            model = GridSearchCV(SGDClassifier(loss='log'),
                                 param_dist,
                                 cv=5,
                                 iid=False,
                                 n_jobs=-1
                                 )
        elif mod == 'gbc':
            param_dist = {
                'loss': ['deviance'],
                'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
                'min_samples_split': np.linspace(0.1, 0.5, 12),
                'min_samples_leaf': np.linspace(0.1, 0.5, 12),
                'max_depth': [3, 5, 8],
                'max_features': ['log2', 'sqrt'],
                'criterion': ['friedman_mse', 'mae'],
                'subsample': [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
                'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
            }

            model = RandomizedSearchCV(GradientBoostingClassifier(),
                                       param_distributions=param_dist,
                                       cv=3,
                                       n_iter=10,
                                       n_jobs=-1)

        elif mod == 'nb':
            param_dist = {
                'alpha': np.linspace(0.1, 1.5, 6),
                'fit_prior': [True, False],
            }

            model = RandomizedSearchCV(MultinomialNB(),
                                       param_distributions=param_dist,
                                       cv=10,
                                       n_iter=10,
                                       n_jobs=-1)

        self.pipeline = make_pipeline(DictVectorizer(),
                                      model
                                      )

    @staticmethod
    def featurize_name(name: str = 'x'):
        name = (str(name).lower()
                .replace('ue', 'ü')
                .replace('oe', 'ö')
                .replace('ae', 'ä')
                .replace('ss', 'ß')
                .replace('+', ' ')
                .replace('-', ' '))

        return {
            'l1': name[0],
            'l12': name[0:2],
            'l123': name[0:3],
            'lf1': name[-1],
            'lf12': name[-2:],
            'lf123': name[-3:],
            'lf1234': name[-4:],
            'second_and_last_letter_vowel': name[-1] in 'aeiouüöäy' and name[
                -2] in 'aeiouüöäy',
            'first_and_second_letter_vowel': name[0] in 'aeiouüöäy' and name[
                1] in 'aeiouüöäy',
            'last_letter_vowel': name[-1] in 'aeiouüöäy',
            'first_letter_vowel': name[0] in 'aeiouüöäy',
            'n_char': len(name),
            'n_vowels': sum(map(name.count, 'aeiouüöäy'))  # name is already lower-cased
        }

    def generate_model(self, languages: List[str]):
        df_list = []
        supported_lang = data_utils.get_class_attributes(c.COUNTRIES)
        assert all(i in supported_lang for i in languages), (
            'Language not supported '
            '(supported languages: {})'
            ).format(supported_lang)
        if 'en' in languages:
            MYPATH = os.path.join(os.getcwd(), 'ml_models', 'train_data',
                                  'en')
            onlyfiles = [os.path.join(MYPATH, f) for
                         f in listdir(MYPATH) if isfile(join(MYPATH, f))]

            dfs = [pd.read_csv(f, sep=',', header=None,
                               names=['first_name', 'gender', 'count'])
                   for f in onlyfiles]

            dfs = pd.concat(dfs)

            # Aggregate counts per (first_name, gender) and pivot to one row
            # per name with M/F frequency columns
            dfs_freq = dfs.groupby(['first_name', 'gender'], as_index=False)[
                'count'].sum()
            dfs_freq = dfs_freq.reset_index().pivot(index='first_name',
                                                    columns='gender',
                                                    values='count')
            dfs_freq = dfs_freq.fillna(0)
            dfs_freq.loc[:, 'perc'] = (
                        (dfs_freq['M'] - dfs_freq['F']) / (dfs_freq['M']
                                                           + dfs_freq['F']))
            dfs_freq.loc[:, 'gender'] = np.where(dfs_freq['perc'] > 0.001, 'M', 'F')

            final_df = dfs_freq.reset_index()[['first_name', 'gender']]
            final_df.loc[:, 'poli_dat'] = 0
            final_df.loc[:, 'language'] = 'en'

            df_list.append(final_df)

        # Load additional name data for the requested languages
        language_mapping = {'de': 'Germany',
                            'nl': 'the Netherlands',
                            'ch': 'Swiss',
                            'ie': 'Ireland',
                            'fr': 'France',
                            'be': 'Belgium',
                            'lu': 'Luxembourg',
                            'at': 'Austria',
                            'dn': 'Denmark',
                            'no': 'Norway',
                            'se': 'Sweden',
                            'en': 'Great Britain',
                            'it': 'Italy'}

        col_names = [language_mapping[i] for i in languages]

        df_other = pd.read_csv(
            os.path.join(os.getcwd(), 'ml_models', 'train_data',
                         'other', 'names.csv'),
            sep=';', header=0,
            # note: ['name', 'gender'].append(col_names) returns None, which
            # would silently read every column
            usecols=['name', 'gender'] + col_names)

        # Exclude names that do not occur in any of the countries of interest
        condition = df_other[col_names].sum(axis=1) != 0
        df_other = df_other[condition]

        df_other.rename(columns={'name': 'first_name'}, inplace=True)
        condition = (df_other['gender'] == 'F') | (df_other['gender'] == 'M')

        other_final_df = df_other[condition][['first_name', 'gender']].copy()

        other_final_df.drop_duplicates(inplace=True)
        other_final_df.dropna(how='any', inplace=True)
        other_final_df.loc[:, 'poli_dat'] = 0

        df_list.append(other_final_df)

        # Combine dfs
        final_df = pd.concat(df_list)

        # Keep only names at least 2 characters long
        condition = final_df['first_name'].str.len() >= 2
        final_df = final_df[condition]

        features = np.vectorize(self.featurize_name)

        # Extract the features for the whole dataset
        X = features(final_df.loc[:, 'first_name'])  # X contains the features

        # Get the gender column
        y = final_df.loc[:, 'gender']

        X, y = shuffle(X, y, random_state=1234)
        train_prop = .7
        X_train, X_test = X[:int(train_prop * len(X))], X[int(
            train_prop * len(X)):]
        y_train, y_test = y[:int(train_prop * len(y))], y[int(
            train_prop * len(y)):]

        self.pipeline.fit(X_train, y_train)

        # Store the model
        filename = os.path.join('ml_models',
                                'trained_models',
                                'gender_model_' + '_'.join(
                                    languages) + '.pkl')

        joblib.dump(self.pipeline, filename)

        filename = os.path.join(os.getcwd(), 'ml_models', 'model_report.txt')
        if os.path.exists(filename):
            append_write = 'a'  # append if already exists
        else:
            append_write = 'w+'

        pred_x_train = self.pipeline.predict(X_train)
        pred_x_test = self.pipeline.predict(X_test)

        # Write out metrics on model performance
        with open(filename, append_write) as writefile:
            writefile.write(
                '================================================')
            writefile.write('\nModel run: {}'.format(
                datetime.datetime.fromtimestamp(time.time()).strftime(
                    '%Y-%m-%d %H:%M:%S')))
            writefile.write('\nLanguages: {}'.format(languages))
            writefile.write('\naccuracy (train): {}'.format(
                self.pipeline.score(X_train, y_train)))
            writefile.write('\nperformance metrics (train): {}'.format(
                classification_report(y_train, pred_x_train)))
            writefile.write('\naccuracy (test): {}'.format(
                self.pipeline.score(X_test, y_test)))
            writefile.write('\nperformance metrics (test): {}'.format(
                classification_report(y_test, pred_x_test)))
            writefile.write(
                '\n================================================')

#
# train_mod_instance = TrainMods()
#
# train_mod_instance.generate_model(['ie'])
# train_mod_instance.generate_model(['nl'])
# train_mod_instance.generate_model(['de'])
# train_mod_instance.generate_model([c.COUNTRIES.SWITZERLAND,
#                                    c.COUNTRIES.GERMANY,
#                                    c.COUNTRIES.FRANCE,
#                                    c.COUNTRIES.AUSTRIA,
#                                    c.COUNTRIES.ITALY])

Classes

class TrainMods

Train gender-identification models on first-name data.
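
A minimal usage sketch (assuming the module path shown on this page and the train_data files expected by generate_model; the language code is illustrative):

from ml_models.train_mods import TrainMods

# Build a DictVectorizer + SGD pipeline ('gbc' and 'nb' are also supported)
trainer = TrainMods(mod='sgd')

# Fit on German name data, pickle the model, and log metrics
trainer.generate_model(['de'])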


Methods

def __init__(self, mod: str = 'sgd')

Select the classifier and hyperparameter search for mod ('sgd': SGDClassifier with a grid search; 'gbc': GradientBoostingClassifier with a randomized search; 'nb': MultinomialNB with a randomized search) and chain it behind a DictVectorizer in self.pipeline.
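
Passing an unsupported value fails fast with an assertion error, for example:

TrainMods(mod='svm')
# AssertionError: Model svm not supported, please choose one of the
# following options: ['sgd', 'gbc', 'nb']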


def featurize_name(name: str = 'x')

Lower-case the name, transliterate German digraphs ('ue' → 'ü', 'oe' → 'ö', 'ae' → 'ä', 'ss' → 'ß'), replace '+' and '-' with spaces, and return a dict of prefix/suffix n-grams and vowel-based features.
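
For example (the values follow directly from the implementation above):

TrainMods.featurize_name('maria')
# {'l1': 'm', 'l12': 'ma', 'l123': 'mar',
#  'lf1': 'a', 'lf12': 'ia', 'lf123': 'ria', 'lf1234': 'aria',
#  'second_and_last_letter_vowel': True,
#  'first_and_second_letter_vowel': False,
#  'last_letter_vowel': True, 'first_letter_vowel': False,
#  'n_char': 5, 'n_vowels': 3}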


def generate_model(self, languages: List[str])

Assemble training names for the given language codes, fit the pipeline on a 70/30 train/test split, pickle the fitted pipeline to ml_models/trained_models/gender_model_<languages>.pkl, and append accuracy scores and classification reports to ml_models/model_report.txt. A call sketch follows.
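
A call sketch (the output paths are the ones hard-coded in the method; the language codes are illustrative):

trainer = TrainMods(mod='nb')
trainer.generate_model(['de', 'at'])
# writes ml_models/trained_models/gender_model_de_at.pkl
# appends metrics to ml_models/model_report.txt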


Instance variables

var pipeline

A scikit-learn Pipeline chaining a DictVectorizer with the selected hyperparameter-search wrapper around the classifier; fitted by generate_model.
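
To reuse a trained model, load the pickled pipeline and feed it featurized names (a sketch; the .pkl path assumes a prior generate_model(['de']) run):

import joblib

from ml_models.train_mods import TrainMods

pipeline = joblib.load('ml_models/trained_models/gender_model_de.pkl')
features = [TrainMods.featurize_name(n) for n in ['anna', 'peter']]
print(pipeline.predict(features))  # e.g. ['F' 'M']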