# Source code for the Pre-Processing Exporter

from __future__ import absolute_import
import sys, os

BASE_DIR = os.path.dirname(os.path.dirname(__file__))
sys.path.append(BASE_DIR)
import PMML44 as pml
from enums import *
# Column names that imputer(), min_max_scaler(), rbst_scaler() and
# max_abs_scaler() skip when emitting derived fields. Nothing in this module
# adds entries to the list; presumably it is populated by other code -- TODO confirm.
exception_cols = list()

def _as_list(value):
    """Wrap an unsized value or a single string in a list; return sized containers unchanged."""
    if not hasattr(value, "__len__") or isinstance(value, str):
        return [value]
    return value


def _append_unique(target, items):
    """Append every element of ``items`` to ``target`` that is not already in it."""
    for item in items:
        if item not in target:
            target.append(item)


def _record_pp_metadata(pp_dict, classes, class_attribute, mining_attributes,
                        mining_strategy, mining_replacement_val):
    """Copy the optional categorical / missing-value entries of ``pp_dict`` into the accumulators."""
    for class_key in ('pp_feat_class_lbl', 'pp_feat_class_ohe'):
        if class_key in pp_dict:
            classes.append(pp_dict[class_key])
            class_attribute.append(pp_dict['pp_feat_name'])
    if 'mining_strategy' in pp_dict:
        mining_attributes.append(pp_dict['der_col_names'])
        mining_strategy.append(pp_dict['mining_strategy'])
        mining_replacement_val.append(pp_dict['mining_replacement_val'])


def get_preprocess_val(ppln_sans_predictor, initial_colnames, model):
    """
    Generates elements related to pre-processing

    Walks every step of the pipeline, asks ``get_pml_derived_flds`` for the
    derived fields of each transformer and chains the resulting column names
    so that each transformer consumes the output names of the previous one.

    Parameters
    ----------
    ppln_sans_predictor :
        Iterable of pipeline steps; element ``[1]`` of each step is the
        transformer instance (or a ``DataFrameMapper``).
    initial_colnames : list
        Contains list of feature/column names.
    model :
        Contains an instance of Sklearn model; forwarded to the exporters as
        the ``model`` keyword.

    Returns
    -------
    pml_pp : dictionary
        ``trfm_dict`` (a ``TransformationDictionary`` or ``None`` when no
        derived field was produced), ``derived_col_names``,
        ``preprocessed_col_names``, ``categorical_feat_values`` (tuple of
        class lists and their attribute names) and ``mining_imp_values``
        (tuple of attributes, strategies and replacement values).
    """
    initial_colnames = list(initial_colnames)
    updated_colnames = initial_colnames.copy()
    pml_derived_flds = []
    dtd_feat_names = []
    classes = []
    class_attribute = []
    mining_strategy = []
    mining_replacement_val = []
    mining_attributes = []

    # Per-export state is kept as attributes on the exporter functions:
    # reset the naming counters and tell imputer() which names are raw inputs.
    polynomial_features.poly_ctr = 0
    pca.counter = 0
    imputer.col_names = initial_colnames

    for ppln_step in ppln_sans_predictor:
        ppln_step_inst = ppln_step[1]
        if "DataFrameMapper" == get_class_name(ppln_step_inst):
            dfm_col_names = []
            for dfm_step in ppln_step_inst.features:
                # Normalise the selector BEFORE looking at the transformers so a
                # plain string such as 'age' is never iterated character by
                # character when the step has no transformer.
                dfm_step_col_names = _as_list(dfm_step[0])
                dfm_step_trfms = dfm_step[1]
                _append_unique(dtd_feat_names, dfm_step_col_names)
                if not dfm_step_trfms:
                    # Pass-through columns keep their names unchanged.
                    _append_unique(dfm_col_names, dfm_step_col_names)
                    continue
                for trfm in _as_list(dfm_step_trfms):
                    pp_dict = get_pml_derived_flds(trfm, dfm_step_col_names, model=model)
                    _record_pp_metadata(pp_dict, classes, class_attribute, mining_attributes,
                                        mining_strategy, mining_replacement_val)
                    pml_derived_flds.extend(pp_dict['der_fld'])
                    # The next transformer of this step works on the derived names.
                    dfm_step_col_names = pp_dict['der_col_names']
                dfm_col_names.extend(dfm_step_col_names)
            updated_colnames = dfm_col_names
        else:
            if not dtd_feat_names:
                dtd_feat_names = initial_colnames
                updated_colnames = initial_colnames
            for trfm in _as_list(ppln_step_inst):
                pp_dict = get_pml_derived_flds(trfm, updated_colnames, model=model)
                _record_pp_metadata(pp_dict, classes, class_attribute, mining_attributes,
                                    mining_strategy, mining_replacement_val)
                pml_derived_flds.extend(pp_dict['der_fld'])
                updated_colnames = pp_dict['der_col_names']

    pml_trfm_dict = None
    if pml_derived_flds:
        pml_trfm_dict = pml.TransformationDictionary(DerivedField=pml_derived_flds)

    pml_pp = dict()
    pml_pp['trfm_dict'] = pml_trfm_dict
    pml_pp['derived_col_names'] = updated_colnames
    pml_pp['preprocessed_col_names'] = dtd_feat_names
    pml_pp['categorical_feat_values'] = classes, class_attribute
    pml_pp['mining_imp_values'] = mining_attributes, mining_strategy, mining_replacement_val
    return pml_pp
def get_class_name(cls):
    """
    Provides the class name for the given instance

    Parameters
    ----------
    cls :
        Any object; in this module it is a pre-processing instance.

    Returns
    -------
    str
        The ``__name__`` of the object's class.
    """
    klass = cls.__class__
    return klass.__name__
def get_pml_derived_flds(trfm, col_names, **kwargs):
    """
    Generates elements related to pre-processing for a given transformer object

    The transformer is dispatched on its class name to the matching exporter
    function of this module.

    Parameters
    ----------
    trfm :
        Contains the pre-processing instance.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.
    **kwargs :
        Forwarded only to the exporters that accept keyword arguments
        (StandardScaler, Imputer, LabelBinarizer, OneHotEncoder).

    Returns
    -------
    pp_dict : dictionary
        The dictionary produced by the selected exporter.

    Raises
    ------
    TypeError
        If the class name of ``trfm`` has no exporter.
    """
    trfm_kind = get_class_name(trfm)

    # Exporters that also receive the caller's keyword arguments.
    kwargs_exporters = {
        "StandardScaler": std_scaler,
        "Imputer": imputer,
        "LabelBinarizer": lbl_binarizer,
        "OneHotEncoder": one_hot_encoder,
    }
    # Exporters that only take the transformer and the column names.
    plain_exporters = {
        "MinMaxScaler": min_max_scaler,
        "RobustScaler": rbst_scaler,
        "MaxAbsScaler": max_abs_scaler,
        "TfidfVectorizer": tfidf_vectorizer,
        "CountVectorizer": count_vectorizer,
        "LabelEncoder": lbl_encoder,
        "Binarizer": binarizer,
        "PolynomialFeatures": polynomial_features,
        "PCA": pca,
        "CategoricalImputer": cat_imputer,
        "Lag": lag,
    }

    if trfm_kind in kwargs_exporters:
        return kwargs_exporters[trfm_kind](trfm, col_names, **kwargs)
    if trfm_kind in plain_exporters:
        return plain_exporters[trfm_kind](trfm, col_names)
    raise TypeError("This PreProcessing Task is not Supported")
def get_derived_colnames(trfm_name, col_names, *args):
    """
    Generates derived column names for a given transformer

    Each name has the form ``<trfm_name>(<column>)<suffix>``.

    Parameters
    ----------
    trfm_name : String
        Prefix placed in front of the parenthesised column name.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.
    *args :
        Optional; the first extra positional argument is appended as a suffix
        to every name.

    Returns
    -------
    list
        One derived name per entry of ``col_names``, in the same order.
    """
    suffix = args[0] if args else ""
    return [trfm_name + '(' + str(col_name) + ')' + suffix for col_name in col_names]
def any_in(seq_a, seq_b):
    """
    Checks for common elements in two given sequences

    Parameters
    ----------
    seq_a : list
        A list of items
    seq_b : list
        A list of items

    Returns
    -------
    bool
        True as soon as one item of ``seq_a`` is found in ``seq_b``,
        otherwise False.
    """
    for elem in seq_a:
        if elem in seq_b:
            return True
    return False
# Exporter functions for the individual pre-processing transformers
def imputer(trfm, col_names, **kwargs):
    """
    Generates pre-processing elements for Scikit-Learn's Imputer

    When any of ``col_names`` is one of the names stored in
    ``imputer.col_names``, no derived field is created and the imputation is
    reported through the ``mining_*`` entries instead. Otherwise an
    ``if(isMissing(col), statistic, col)`` derived field is emitted for every
    column that is not listed in ``exception_cols``.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Imputer preprocessing instance; ``strategy``
        and ``statistics_`` are read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.
    **kwargs :
        Must contain ``model`` (a ``KeyError`` is raised otherwise); the value
        itself is not used here.

    Returns
    -------
    pp_dict : dictionary
        Always holds ``der_fld`` and ``der_col_names``; additionally
        ``mining_strategy``, ``mining_replacement_val`` and
        ``mining_attributes`` in the first case described above.
    """
    source_columns = imputer.col_names
    kwargs['model']  # kept for its KeyError when the keyword is missing

    treatment = trfm.strategy
    if "mean" in treatment:
        treatment = MISSING_VALUE_TREATMENT_METHOD.AS_MEAN.value
    elif "median" in treatment:
        treatment = MISSING_VALUE_TREATMENT_METHOD.AS_MEDIAN.value
    elif "most_frequent" in treatment:
        treatment = MISSING_VALUE_TREATMENT_METHOD.AS_MODE.value
    fill_values = trfm.statistics_

    pp_dict = {}
    derived_flds = []
    derived_colnames = col_names

    if any_in(source_columns, col_names):
        pp_dict['mining_strategy'] = treatment
        pp_dict['mining_replacement_val'] = fill_values
        pp_dict['mining_attributes'] = col_names
    else:
        derived_colnames = get_derived_colnames('imputer', col_names)
        for idx in range(len(col_names)):
            col_name = col_names[idx]
            if col_name in exception_cols:
                continue
            missing_check = pml.Apply(
                function=FUNCTION.IS_MISSING.value,
                FieldRef=[pml.FieldRef(field=col_name)]
            )
            replacement = pml.Constant(
                dataType=DATATYPE.DOUBLE.value,
                valueOf_=fill_values[idx]
            )
            passthrough = pml.FieldRef(field=col_name)
            # The FieldRef travels in the Constant list, so its tag name is set explicitly.
            passthrough.original_tagname_ = "FieldRef"
            if_missing = pml.Apply(
                Apply_member=[missing_check],
                function=FUNCTION.IF.value,
                Constant=[replacement, passthrough]
            )
            derived_flds.append(pml.DerivedField(
                Apply=if_missing,
                name=derived_colnames[idx],
                optype=OPTYPE.CONTINUOUS.value,
                dataType=DATATYPE.DOUBLE.value
            ))

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def cat_imputer(trfm, col_names):
    """
    Generates pre-processing elements for sklearn-pandas' CategoricalImputer

    No derived field is produced; the imputation is reported only through the
    ``mining_*`` entries, with the mode treatment and ``trfm.fill_`` as the
    replacement value.

    Parameters
    ----------
    trfm :
        Contains the CategoricalImputer instance; ``fill_`` is read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``mining_strategy``, ``mining_replacement_val``, ``mining_attributes``,
        an empty ``der_fld`` list and ``der_col_names`` (the unchanged input
        names).
    """
    return {
        'mining_strategy': MISSING_VALUE_TREATMENT_METHOD.AS_MODE.value,
        'mining_replacement_val': trfm.fill_,
        'mining_attributes': col_names,
        'der_fld': list(),
        'der_col_names': col_names,
    }
def pca(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's PCA

    Every component becomes one derived field of the form
    ``sum((col_j - mean_j) * components_[i][j])``, named
    ``PCA<counter>-<i>`` where ``<counter>`` is ``pca.counter`` after being
    incremented by this call.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PCA preprocessing instance; ``mean_``,
        ``components_`` and ``n_components_`` are read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld`` (the derived fields) and ``der_col_names`` (their names).
    """
    pca.counter += 1
    means = trfm.mean_
    # The feature count is taken from mean_ rather than the version-specific
    # ``n_features_`` attribute, which newer scikit-learn releases do not provide.
    n_features = len(means)

    derived_flds = []
    derived_colnames = []
    for comp_idx in range(trfm.n_components_):
        weights = trfm.components_[comp_idx]
        terms = []
        for feat_idx in range(n_features):
            centered = pml.Apply(
                function=FUNCTION.SUBSTRACTTION.value,
                Constant=[pml.Constant(dataType=DATATYPE.DOUBLE.value,
                                       valueOf_="{:.16f}".format(means[feat_idx]))],
                FieldRef=[pml.FieldRef(field=col_names[feat_idx])]
            )
            weight = weights[feat_idx]
            # An exact zero weight is written as the number 0.0, not as a formatted string.
            weight_value = 0.0 if weight == 0.0 else "{:.16f}".format(weight)
            terms.append(pml.Apply(
                function=FUNCTION.MULTIPLICATION.value,
                Apply_member=[centered],
                Constant=[pml.Constant(dataType=DATATYPE.DOUBLE.value, valueOf_=weight_value)]
            ))
        component_sum = pml.Apply(function=FUNCTION.SUM.value, Apply_member=terms)
        derived_flds.append(pml.DerivedField(
            Apply=component_sum,
            dataType=DATATYPE.DOUBLE.value,
            optype=OPTYPE.CONTINUOUS.value,
            name="PCA" + str(pca.counter) + "-" + str(comp_idx)
        ))
        derived_colnames.append(derived_flds[comp_idx].get_name())

    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def tfidf_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's TfIdfVectorizer

    One derived field ``tfidf@[<col>](<term>)`` is emitted per vocabulary
    term: a ``TextIndex`` for the term multiplied by the term's idf weight.
    When ``trfm.lowercase`` is set, a ``lowercase(<col>)`` field is emitted
    first and the text indexes read from it; otherwise they read the raw
    column, as ``count_vectorizer`` does.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's TfIdfVectorizer preprocessing instance;
        the feature names, ``idf_``, ``vocabulary_`` and ``lowercase`` are read.
    col_names : list
        Contains list of feature/column names; only the first entry is used.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld``, ``der_col_names``, ``pp_feat_name`` (the source column)
        and an empty ``pp_feat_class_lbl`` list.
    """
    source_col = col_names[0]
    # get_feature_names() is missing in newer scikit-learn, so prefer
    # get_feature_names_out() when the transformer offers it.
    feature_getter = getattr(trfm, "get_feature_names_out", None) or trfm.get_feature_names
    # repr of the UTF-8 bytes without the b'...' wrapper, i.e. an ASCII-escaped term.
    features = [str(feat.encode("utf8"))[2:-1] for feat in feature_getter()]
    idfs = trfm.idf_
    # NOTE(review): vocabulary_ keys are taken in dict order and paired with the
    # features by position; confirm this pairing is the intended Extension value.
    extra_features = list(trfm.vocabulary_.keys())
    derived_colnames = get_derived_colnames('tfidf@[' + source_col + ']', features)

    derived_flds = list()
    text_field = source_col
    if trfm.lowercase:
        text_field = 'lowercase(' + source_col + ')'
        derived_flds.append(pml.DerivedField(
            name=text_field,
            optype=OPTYPE.CATEGORICAL.value,
            dataType=DATATYPE.STRING.value,
            Apply=pml.Apply(function=FUNCTION.LOWERCASE.value,
                            FieldRef=[pml.FieldRef(field=source_col)])
        ))

    for feat_idx, (feature, idf) in enumerate(zip(features, idfs)):
        term_index = pml.TextIndex(
            textField=text_field,
            wordSeparatorCharacterRE='\\s+',
            tokenize='true',
            Constant=pml.Constant(valueOf_=feature),
            Extension=[pml.Extension(value=extra_features[feat_idx])]
        )
        derived_flds.append(pml.DerivedField(
            name=derived_colnames[feat_idx],
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value,
            Apply=pml.Apply(function=FUNCTION.MULTIPLICATION.value,
                            TextIndex=[term_index],
                            Constant=[pml.Constant(valueOf_="{:.16f}".format(idf))])
        ))

    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = source_col
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
def count_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's CountVectorizer

    One derived field ``count_vec@[<col>](<term>)`` holding a ``TextIndex``
    is emitted per vocabulary term. When ``trfm.lowercase`` is set, a
    ``lowercase(<col>)`` field is emitted first and the text indexes read
    from it; otherwise they read the raw column.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's CountVectorizer preprocessing instance;
        the feature names, ``vocabulary_`` and ``lowercase`` are read.
    col_names : list
        Contains list of feature/column names; only the first entry is used.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld``, ``der_col_names``, ``pp_feat_name`` (the source column)
        and an empty ``pp_feat_class_lbl`` list.
    """
    source_col = col_names[0]
    # get_feature_names() is missing in newer scikit-learn, so prefer
    # get_feature_names_out() when the transformer offers it.
    feature_getter = getattr(trfm, "get_feature_names_out", None) or trfm.get_feature_names
    # repr of the UTF-8 bytes without the b'...' wrapper, i.e. an ASCII-escaped term.
    features = [str(feat.encode("utf8"))[2:-1] for feat in feature_getter()]
    # NOTE(review): vocabulary_ keys are taken in dict order and paired with the
    # features by position; confirm this pairing is the intended Extension value.
    extra_features = list(trfm.vocabulary_.keys())
    derived_colnames = get_derived_colnames('count_vec@[' + source_col + ']', features)

    derived_flds = list()
    text_field = source_col
    if trfm.lowercase:
        text_field = 'lowercase(' + source_col + ')'
        derived_flds.append(pml.DerivedField(
            name=text_field,
            optype=OPTYPE.CATEGORICAL.value,
            dataType=DATATYPE.STRING.value,
            Apply=pml.Apply(function=FUNCTION.LOWERCASE.value,
                            FieldRef=[pml.FieldRef(field=source_col)])
        ))

    for index, feature in enumerate(features):
        derived_flds.append(pml.DerivedField(
            name=derived_colnames[index],
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value,
            TextIndex=pml.TextIndex(
                textField=text_field,
                wordSeparatorCharacterRE='\\s+',
                tokenize='true',
                Constant=pml.Constant(dataType=DATATYPE.STRING.value, valueOf_=feature),
                Extension=[pml.Extension(value=extra_features[index])]
            )
        ))

    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = source_col
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
def lag(trfm, col_names):
    """
    Generates pre-processing elements for Nyoka's Lag

    Each column yields one derived field wrapping a ``Lag`` element built
    from ``trfm.value`` and ``trfm.aggregation``; the derived names use the
    aggregation as prefix.

    Parameters
    ----------
    trfm :
        Contains the Nyoka's Lag instance; ``aggregation`` and ``value``
        are read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld`` (the derived fields) and ``der_col_names`` (their names).
    """
    derived_colnames = get_derived_colnames(trfm.aggregation, col_names)
    derived_flds = []
    for source_name, derived_name in zip(col_names, derived_colnames):
        lag_element = pml.Lag(field=source_name, n=trfm.value, aggregate=trfm.aggregation)
        derived_flds.append(pml.DerivedField(
            name=derived_name,
            Lag=lag_element,
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value
        ))
    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
def std_scaler(trfm, col_names, **kwargs):
    """
    Generates pre-processing elements for Scikit-Learn's StandardScaler

    Each column becomes ``(col - mean) / scale``. Centering and scaling are
    applied only when the scaler itself applies them: with ``with_mean=False``
    (or no ``mean_``) the subtracted constant is 0, with ``with_std=False``
    (or no ``scale_``) the divisor is 1, so the exported expression matches
    what the fitted scaler computes.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Standard Scaler preprocessing instance;
        ``mean_``, ``scale_``, ``with_mean`` and ``with_std`` are read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.
    **kwargs :
        Accepted for interface compatibility; not used.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld`` (the derived fields) and ``der_col_names`` (their names).
    """
    derived_colnames = get_derived_colnames('standardScaler', col_names)

    # A missing flag is treated as True so objects lacking these attributes
    # behave as before.
    means = trfm.mean_ if getattr(trfm, 'with_mean', True) else None
    scales = trfm.scale_ if getattr(trfm, 'with_std', True) else None

    derived_flds = list()
    for col_name_idx, col_name in enumerate(col_names):
        center = 0.0 if means is None else means[col_name_idx]
        divisor = 1.0 if scales is None else scales[col_name_idx]
        centered = pml.Apply(
            function=FUNCTION.SUBSTRACTTION.value,
            Constant=[pml.Constant(
                dataType=DATATYPE.DOUBLE.value,
                valueOf_="{:.16f}".format(center)
            )],
            FieldRef=[pml.FieldRef(field=col_name)]
        )
        scaled = pml.Apply(
            Apply_member=[centered],
            function=FUNCTION.DIVISION.value,
            Constant=[pml.Constant(
                dataType=DATATYPE.DOUBLE.value,
                valueOf_="{:.16f}".format(divisor)
            )]
        )
        derived_flds.append(pml.DerivedField(
            Apply=scaled,
            name=derived_colnames[col_name_idx],
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value
        ))

    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def min_max_scaler(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's MinMaxScaler

    Each column that is not listed in ``exception_cols`` becomes
    ``col * scale_ + min_``. Derived names are generated for every input
    column, including skipped ones.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's MinMaxScaler preprocessing instance;
        ``scale_`` and ``min_`` are read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld`` (the derived fields) and ``der_col_names`` (their names).
    """
    derived_colnames = get_derived_colnames("minMaxScaler", col_names)
    derived_flds = []
    for idx in range(len(col_names)):
        source_name = col_names[idx]
        if source_name in exception_cols:
            continue
        scaled = pml.Apply(
            function=FUNCTION.MULTIPLICATION.value,
            Constant=[pml.Constant(
                dataType=DATATYPE.DOUBLE.value,
                valueOf_="{:.16f}".format(trfm.scale_[idx])
            )],
            FieldRef=[pml.FieldRef(field=source_name)]
        )
        shifted = pml.Apply(
            Apply_member=[scaled],
            function=FUNCTION.ADDITION.value,
            Constant=[pml.Constant(
                dataType=DATATYPE.DOUBLE.value,
                valueOf_="{:.16f}".format(trfm.min_[idx])
            )]
        )
        derived_flds.append(pml.DerivedField(
            Apply=shifted,
            name=derived_colnames[idx],
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value
        ))
    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
def rbst_scaler(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's RobustScaler

    Each column that is not listed in ``exception_cols`` becomes
    ``(col - center_) / scale_``. When the scaler was fitted without
    centering or without scaling (``center_`` / ``scale_`` is ``None``) the
    neutral constants 0 and 1 are exported instead of failing on the missing
    array.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's RobustScaler preprocessing instance;
        ``center_`` and ``scale_`` are read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld`` (the derived fields) and ``der_col_names`` (their names,
        generated for every input column including skipped ones).
    """
    derived_colnames = get_derived_colnames('robustScaler', col_names)
    centers = trfm.center_
    scales = trfm.scale_

    derived_flds = list()
    for col_name_idx, col_name in enumerate(col_names):
        if col_name in exception_cols:
            continue
        center = 0.0 if centers is None else centers[col_name_idx]
        divisor = 1.0 if scales is None else scales[col_name_idx]
        centered = pml.Apply(
            function=FUNCTION.SUBSTRACTTION.value,
            Constant=[pml.Constant(
                dataType=DATATYPE.DOUBLE.value,
                valueOf_="{:.16f}".format(center)
            )],
            FieldRef=[pml.FieldRef(field=col_name)]
        )
        scaled = pml.Apply(
            Apply_member=[centered],
            function=FUNCTION.DIVISION.value,
            Constant=[pml.Constant(
                dataType=DATATYPE.DOUBLE.value,
                valueOf_="{:.16f}".format(divisor)
            )]
        )
        derived_flds.append(pml.DerivedField(
            Apply=scaled,
            name=derived_colnames[col_name_idx],
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value
        ))

    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def max_abs_scaler(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's MaxAbsScaler

    Each column that is not listed in ``exception_cols`` becomes
    ``col / divisor``. The divisor is taken from ``scale_`` when the scaler
    has it, falling back to ``max_abs_`` otherwise, so an all-zero column
    does not export a division by zero.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's MaxAbsScaler preprocessing instance;
        ``scale_`` (preferred) or ``max_abs_`` is read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld`` (the derived fields) and ``der_col_names`` (their names,
        generated for every input column including skipped ones).
    """
    derived_colnames = get_derived_colnames('maxAbsScaler', col_names)
    divisors = getattr(trfm, 'scale_', None)
    if divisors is None:
        divisors = trfm.max_abs_

    derived_flds = list()
    for col_name_idx, col_name in enumerate(col_names):
        if col_name in exception_cols:
            continue
        scaled = pml.Apply(
            function=FUNCTION.DIVISION.value,
            Constant=[pml.Constant(
                dataType=DATATYPE.DOUBLE.value,
                valueOf_="{:.16f}".format(divisors[col_name_idx])
            )],
            FieldRef=[pml.FieldRef(field=col_name)]
        )
        derived_flds.append(pml.DerivedField(
            Apply=scaled,
            name=derived_colnames[col_name_idx],
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value
        ))

    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def lbl_encoder(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's LabelEncoder

    A single ``MapValues`` derived field is produced for the first column;
    its inline table maps every entry of ``trfm.classes_`` to the string form
    of the value ``trfm.transform`` returns for it.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's LabelEncoder preprocessing instance;
        ``classes_`` and ``transform`` are used.
    col_names : list
        Contains list of feature/column names; only the first entry is mapped.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld``, ``der_col_names``, ``pp_feat_class_lbl`` (the class
        labels) and ``pp_feat_name`` (the first column name).
    """
    categoric_lbls = trfm.classes_.tolist()
    encoded_values = trfm.transform(trfm.classes_.tolist()).tolist()
    derived_colnames = get_derived_colnames('labelEncoder', col_names)

    rows = []
    for row_idx, encoded in enumerate(encoded_values):
        table_row = pml.row()
        table_row.elementobjs_ = ['input', 'output']
        table_row.input = categoric_lbls[row_idx]
        table_row.output = str(encoded)
        rows.append(table_row)

    column_pairs = [pml.FieldColumnPair(field=str(col_names[0]), column="input")]
    mapping = pml.MapValues(
        outputColumn="output",
        FieldColumnPair=column_pairs,
        InlineTable=pml.InlineTable(row=rows)
    )
    derived_flds = [pml.DerivedField(
        MapValues=mapping,
        name=derived_colnames[0],
        optype=OPTYPE.CONTINUOUS.value,
        dataType=DATATYPE.DOUBLE.value
    )]

    return {
        'der_fld': derived_flds,
        'der_col_names': derived_colnames,
        'pp_feat_class_lbl': categoric_lbls,
        'pp_feat_name': col_names[0],
    }
def binarizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's Binarizer

    Each column becomes a ``threshold`` function applied to the column with
    ``trfm.threshold`` as constant.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Binarizer preprocessing instance;
        ``threshold`` is read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld`` (the derived fields) and ``der_col_names`` (their names).
    """
    derived_colnames = get_derived_colnames("binarizer", col_names)
    derived_flds = []
    for idx in range(len(col_names)):
        thresholded = pml.Apply(
            function=FUNCTION.THRESHOLD.value,
            Constant=[pml.Constant(
                dataType=DATATYPE.DOUBLE.value,
                valueOf_=trfm.threshold
            )],
            FieldRef=[pml.FieldRef(field=col_names[idx])]
        )
        derived_flds.append(pml.DerivedField(
            Apply=thresholded,
            name=derived_colnames[idx],
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value
        ))
    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
def polynomial_features(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's PolynomialFeatures

    Every row of ``trfm.powers_`` becomes one derived field: the product of
    ``pow(col_j, powers_[i][j])`` over all columns, named
    ``poly<counter>-x<i>`` where ``<counter>`` is
    ``polynomial_features.poly_ctr`` after being incremented by this call.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PolynomialFeatures preprocessing instance;
        ``powers_`` is read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld`` (the derived fields) and ``der_col_names`` (their names).
    """
    polynomial_features.poly_ctr += 1
    powers = trfm.powers_

    derived_flds = []
    derived_colnames = []
    for term_idx in range(powers.shape[0]):
        factors = []
        for col_idx in range(len(col_names)):
            exponent = int(powers[term_idx][col_idx])
            factors.append(pml.Apply(
                function=FUNCTION.POWER.value,
                Constant=[pml.Constant(
                    dataType=DATATYPE.INTEGER.value,
                    valueOf_=exponent
                )],
                FieldRef=[pml.FieldRef(field=col_names[col_idx])]
            ))
        product = pml.Apply(function=FUNCTION.PRODUCT.value, Apply_member=factors)
        derived_flds.append(pml.DerivedField(
            Apply=product,
            dataType=DATATYPE.DOUBLE.value,
            optype=OPTYPE.CONTINUOUS.value,
            name="poly" + str(polynomial_features.poly_ctr) + '-' + "x" + str(term_idx)
        ))
        derived_colnames.append(derived_flds[term_idx].get_name())

    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
def lbl_binarizer(trfm, col_names, **kwargs):
    """
    Generates pre-processing elements for Scikit-Learn's LabelBinarizer

    With exactly two classes a single ``NormDiscrete`` indicator for the last
    class is emitted per column; otherwise one indicator per class is emitted.
    ``der_col_names`` holds the names generated for the last processed column.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Label Binarizer preprocessing instance;
        ``classes_`` is read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.
    **kwargs :
        Accepted for interface compatibility; not used.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld``, ``der_col_names``, ``pp_feat_class_lbl`` (the class
        labels) and ``pp_feat_name`` (the first column name).
    """
    categoric_lbls = trfm.classes_.tolist()
    derived_flds = []
    derived_colnames = []

    for col_name in col_names:
        name_prefix = "labelBinarizer(" + str(col_name)
        if len(categoric_lbls) == 2:
            # Binary case: only the last class gets an indicator field.
            derived_colnames = get_derived_colnames(name_prefix, [categoric_lbls[-1]], ")")
            indicator = pml.NormDiscrete(field=str(col_names[-1]), value=str(categoric_lbls[-1]))
            derived_flds.append(pml.DerivedField(
                NormDiscrete=indicator,
                name=derived_colnames[-1],
                optype=OPTYPE.CATEGORICAL.value,
                dataType=DATATYPE.DOUBLE.value
            ))
            continue

        derived_colnames = get_derived_colnames(name_prefix, categoric_lbls, ")")
        for attribute_name in col_names:
            for class_idx, class_name in enumerate(categoric_lbls):
                indicator = pml.NormDiscrete(field=str(attribute_name), value=str(class_name))
                derived_flds.append(pml.DerivedField(
                    NormDiscrete=indicator,
                    name=derived_colnames[class_idx],
                    optype=OPTYPE.CATEGORICAL.value,
                    dataType=DATATYPE.DOUBLE.value
                ))

    return {
        'der_fld': derived_flds,
        'der_col_names': derived_colnames,
        'pp_feat_class_lbl': categoric_lbls,
        'pp_feat_name': col_names[0],
    }
def one_hot_encoder(trfm, col_names, **kwargs):
    """
    Generates pre-processing elements for Scikit-Learn's OneHotEncoder

    One ``NormDiscrete`` indicator field is emitted per category of
    ``trfm.categories_[0]``. Only the first category list is read, and
    ``der_col_names`` holds the names generated for the last processed column.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's One hot encoder preprocessing instance;
        ``categories_[0]`` is read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.
    **kwargs :
        Accepted for interface compatibility; not used.

    Returns
    -------
    pp_dict : dictionary
        ``der_fld``, ``der_col_names``, ``pp_feat_class_ohe`` (the category
        labels) and ``pp_feat_name`` (the first column name).
    """
    categoric_lbls = trfm.categories_[0].tolist()
    derived_flds = []
    derived_colnames = []

    for col_name in col_names:
        derived_colnames = get_derived_colnames(
            "oneHotEncoder(" + str(col_name), categoric_lbls, ")"
        )
        for attribute_name in col_names:
            for class_idx, class_name in enumerate(categoric_lbls):
                indicator = pml.NormDiscrete(field=str(attribute_name), value=str(class_name))
                derived_flds.append(pml.DerivedField(
                    NormDiscrete=indicator,
                    name=derived_colnames[class_idx],
                    optype=OPTYPE.CATEGORICAL.value,
                    dataType=DATATYPE.DOUBLE.value
                ))

    return {
        'der_fld': derived_flds,
        'der_col_names': derived_colnames,
        'pp_feat_class_ohe': categoric_lbls,
        'pp_feat_name': col_names[0],
    }