from __future__ import absolute_import
import sys, os
BASE_DIR = os.path.dirname(os.path.dirname(__file__))
sys.path.append(BASE_DIR)
import PMML44 as pml
import pre_process as pp
from datetime import datetime
import math
import metadata
from enums import *
def skl_to_pmml(pipeline, col_names, target_name='target', pmml_f_name='from_sklearn.pmml', model_name=None, description=None):
    """
    Exports scikit-learn pipeline object into pmml

    Parameters
    ----------
    pipeline :
        Contains an instance of Pipeline with preprocessing and final estimator
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the target column. (Default='target')
    pmml_f_name : String
        Name of the pmml file. (Default='from_sklearn.pmml')
    model_name : string (optional)
        Name of the model
    description : string (optional)
        Description of the model

    Returns
    -------
    Generates a PMML object and exports it to `pmml_f_name`

    Raises
    ------
    TypeError
        If `pipeline` is a bare estimator instead of a Pipeline instance.
    """
    try:
        model = pipeline.steps[-1][1]
    except AttributeError:
        # A bare estimator has no `steps` attribute; only Pipeline objects are supported.
        raise TypeError("Exporter expects pipeline_instance and not an estimator_instance")
    else:
        import numpy as np
        if isinstance(col_names, np.ndarray):
            col_names = col_names.tolist()
        ppln_sans_predictor = pipeline.steps[:-1]
        trfm_dict_kwargs = dict()
        derived_col_names = col_names
        categoric_values = tuple()
        mining_imp_val = tuple()
        if ppln_sans_predictor:
            # Pre-processing steps exist: build the TransformationDictionary and
            # derive the post-transformation feature names and metadata.
            pml_pp = pp.get_preprocess_val(ppln_sans_predictor, col_names, model)
            trfm_dict_kwargs['TransformationDictionary'] = pml_pp['trfm_dict']
            derived_col_names = pml_pp['derived_col_names']
            col_names = pml_pp['preprocessed_col_names']
            categoric_values = pml_pp['categorical_feat_values']
            mining_imp_val = pml_pp['mining_imp_values']
        PMML_kwargs = get_PMML_kwargs(model,
                                      derived_col_names,
                                      col_names,
                                      target_name,
                                      mining_imp_val,
                                      categoric_values,
                                      model_name)
        pmml = pml.PMML(
            version=PMML_SCHEMA.VERSION.value,
            Header=get_header(description),
            DataDictionary=get_data_dictionary(model, col_names, target_name, categoric_values),
            **trfm_dict_kwargs,
            **PMML_kwargs
        )
        # Use a context manager so the output file is closed deterministically
        # (the original leaked the file handle).
        with open(pmml_f_name, "w") as pmml_file:
            pmml.export(outfile=pmml_file, level=0)
def any_in(seq_a, seq_b):
    """
    Checks for common elements in two given sequence elements

    Parameters
    ----------
    seq_a : list
        A list of items
    seq_b : list
        A list of items

    Returns
    -------
    Returns a boolean value if any item of seq_a belongs to seq_b or visa versa
    """
    # Intersection test via set: True as soon as the two sequences share an item.
    return not set(seq_a).isdisjoint(seq_b)
def get_PMML_kwargs(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name):
    """
    It returns all the pmml elements.

    Parameters
    ----------
    model : Scikit-learn model object
        An instance of Scikit-learn model.
    derived_col_names : List
        Contains column names after preprocessing
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the target column .
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    algo_kwargs : Dictionary
        Get the PMML model argument based on scikit learn model object
    """
    super_cls_names = get_super_cls_names(model)
    # Every model-builder helper takes the same argument list.
    args = (model, derived_col_names, col_names, target_name,
            mining_imp_val, categoric_values, model_name)
    # NOTE: the order of these checks matters and mirrors the original
    # elif chain (e.g. tree models are matched before everything else).
    if any_in(('BaseDecisionTree',), super_cls_names):
        return {'TreeModel': get_tree_models(*args)}
    if any_in(('LogisticRegression', 'RidgeClassifier', 'LinearDiscriminantAnalysis',
               'SGDClassifier', 'LinearSVC'), super_cls_names):
        # Binary linear classifiers map to a single RegressionModel;
        # multi-class ones need a MiningModel with one segment per class.
        if len(model.classes_) == 2:
            return {'RegressionModel': get_regrs_models(*args)}
        return {'MiningModel': get_reg_mining_models(*args)}
    if any_in(('LinearRegression', 'LinearSVR'), super_cls_names):
        return {'RegressionModel': get_regrs_models(*args)}
    if any_in(('SVC', 'SVR'), super_cls_names):
        return {'SupportVectorMachineModel': get_supportVectorMachine_models(*args)}
    if any_in(('RandomForestRegressor', 'RandomForestClassifier',
               'GradientBoostingClassifier', 'GradientBoostingRegressor'), super_cls_names):
        return {'MiningModel': get_ensemble_models(*args)}
    if any_in(('MLPClassifier', 'MLPRegressor'), super_cls_names):
        return {'NeuralNetwork': get_neural_models(*args)}
    if any_in(('GaussianNB',), super_cls_names):
        return {'NaiveBayesModel': get_naiveBayesModel(*args)}
    if any_in(('NeighborsBase',), super_cls_names):
        return {'NearestNeighborModel': get_nearestNeighbour_model(*args)}
    if any_in(('OneClassSVM', 'IsolationForest'), super_cls_names):
        return {'AnomalyDetectionModel': get_anomalydetection_model(*args)}
    if any_in(('KMeans',), super_cls_names):
        return {'ClusteringModel': get_clustering_model(*args)}
    raise NotImplementedError("{} is not Implemented!".format(model.__class__.__name__))
def get_model_kwargs(model, col_names, target_name, mining_imp_val, categoric_values):
    """
    It returns all the model element for a specific model.

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values

    Returns
    -------
    model_kwargs : Dictionary
        Returns function name, MiningSchema and Output of the sk_model object
    """
    # Shared keyword arguments common to every generated PMML model element.
    return {
        'functionName': get_mining_func(model),
        'MiningSchema': get_mining_schema(model, col_names, target_name,
                                          mining_imp_val, categoric_values),
        'Output': get_output(model, target_name),
    }
def get_reg_mining_models(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name):
    """
    Creates xml elements for multi-class linear models

    Builds a MiningModel using the "modelChain" method: one regression segment
    per class producing that class's score, plus a final classification segment
    that combines the per-class scores.

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    derived_col_names : List
        Contains column names after preprocessing
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    mining_model : List
        Returns a Nyoka's MiningModel object
    """
    num_classes = len(model.classes_)
    model_kwargs = get_model_kwargs(model, col_names, target_name, mining_imp_val, categoric_values)
    mining_model = pml.MiningModel(modelName=model_name if model_name else model.__class__.__name__,**model_kwargs)
    # Inner per-class regression segments must not carry the target field.
    inner_mining_schema = [mfield for mfield in model_kwargs['MiningSchema'].MiningField if mfield.usageType != FIELD_USAGE_TYPE.TARGET.value]
    segmentation = pml.Segmentation(multipleModelMethod=MULTIPLE_MODEL_METHOD.MODEL_CHAIN.value)
    # One regression segment per class; each outputs a score field
    # ("probablity_<idx>" — spelling kept consistent across all segments).
    for idx in range(num_classes):
        segment = pml.Segment(id=str(idx+1),True_=pml.True_())
        segment.RegressionModel = pml.RegressionModel(
            functionName=MINING_FUNCTION.REGRESSION.value,
            MiningSchema=pml.MiningSchema(
                MiningField=inner_mining_schema
            ),
            Output=pml.Output(
                OutputField=[
                    pml.OutputField(
                        name="probablity_"+str(idx),
                        optype=OPTYPE.CONTINUOUS.value,
                        dataType=DATATYPE.DOUBLE.value
                    )
                ]
            ),
            RegressionTable=get_reg_tab_for_reg_mining_model(model,derived_col_names,idx,categoric_values)
        )
        # LinearSVC keeps raw decision values; all other linear classifiers
        # squash the score with the logistic function.
        if model.__class__.__name__ != 'LinearSVC':
            segment.RegressionModel.normalizationMethod = REGRESSION_NORMALIZATION_METHOD.LOGISTIC.value
        segmentation.add_Segment(segment)
    # Final segment: classify by combining the per-class score fields.
    last_segment = pml.Segment(id=str(num_classes+1),True_=pml.True_())
    mining_flds_for_last = [pml.MiningField(name="probablity_"+str(idx)) for idx in range(num_classes)]
    mining_flds_for_last.append(pml.MiningField(name=target_name,usageType=FIELD_USAGE_TYPE.TARGET.value))
    mining_schema_for_last = pml.MiningSchema(MiningField=mining_flds_for_last)
    reg_tab_for_last = list()
    # Identity regression tables: each class's score passes through with
    # coefficient 1.0 and intercept 0.0.
    for idx in range(num_classes):
        reg_tab_for_last.append(
            pml.RegressionTable(
                intercept="0.0",
                targetCategory=str(model.classes_[idx]),
                NumericPredictor=[pml.NumericPredictor(
                    name="probablity_"+str(idx),
                    coefficient="1.0"
                )]
            )
        )
    last_segment.RegressionModel = pml.RegressionModel(
        functionName=MINING_FUNCTION.CLASSIFICATION.value,
        MiningSchema=mining_schema_for_last,
        RegressionTable=reg_tab_for_last
    )
    # simplemax turns the combined scores into a normalized distribution
    # (again skipped for LinearSVC, which has no probability semantics).
    if model.__class__.__name__ != 'LinearSVC':
        last_segment.RegressionModel.normalizationMethod = REGRESSION_NORMALIZATION_METHOD.SIMPLEMAX.value
    segmentation.add_Segment(last_segment)
    mining_model.set_Segmentation(segmentation)
    return [mining_model]
def get_reg_tab_for_reg_mining_model(model, col_names, index, categorical_values):
    """
    Generates Regression Table for multi-class linear models

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    col_names : List
        Contains list of feature/column names.
    index : int
        Row of `coef_`/`intercept_` to export (one per class).
    categorical_values : tuple
        Contains Categorical attribute names and its values

    Returns
    -------
    Returns Nyoka's RegressionTable object
    """
    regression_table = pml.RegressionTable(
        intercept="{:.16f}".format(model.intercept_[index])
    )
    # One NumericPredictor per feature coefficient of the selected class.
    for feature_idx, coefficient in enumerate(model.coef_[index]):
        predictor = pml.NumericPredictor(
            name=col_names[feature_idx],
            coefficient="{:.16f}".format(coefficient)
        )
        regression_table.add_NumericPredictor(predictor)
    return [regression_table]
def get_anomalydetection_model(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name):
    """
    Creates xml elements for anomaly detection models

    Wraps either a SupportVectorMachineModel (OneClassSVM) or a MiningModel
    (IsolationForest) inside an AnomalyDetectionModel element.

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    derived_col_names : List
        Contains column names after preprocessing
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    anomaly_detection_model : List
        Returns Nyoka's AnomalyDetectionModel object
    """
    anomaly_detection_model = list()
    if 'OneClassSVM' in str(model.__class__):
        # OneClassSVM: the anomaly detector wraps an inner SVM model.
        svm_model = get_supportVectorMachine_models(model,
                                          derived_col_names,
                                          col_names,
                                          target_name,
                                          mining_imp_val,
                                          categoric_values, model_name)[0]
        anomaly_detection_model.append(
            pml.AnomalyDetectionModel(
                modelName=model_name if model_name else model.__class__.__name__,
                algorithmType=ANOMALY_DETECTION_ALGORITHM.ONE_CLASS_SVM.value,
                functionName=MINING_FUNCTION.REGRESSION.value,
                MiningSchema=get_mining_schema(model, col_names, target_name, mining_imp_val,categoric_values),
                Output=get_anomaly_detection_output(model),
                SupportVectorMachineModel=svm_model
            )
        )
    else:
        # IsolationForest: the anomaly detector wraps an inner tree ensemble
        # whose segments predict the average path length, hence the inner
        # target name 'avg_path_length' instead of the user's target.
        mining_schema = get_mining_schema(model, col_names, target_name, mining_imp_val,categoric_values)
        ensemble_model = get_ensemble_models(model,
                                  derived_col_names,
                                  col_names,
                                  'avg_path_length',
                                  mining_imp_val,
                                  categoric_values, model_name)[0]
        anomaly_detection_model.append(
            pml.AnomalyDetectionModel(
                modelName=model_name if model_name else "IsolationForest",
                algorithmType=ANOMALY_DETECTION_ALGORITHM.ISOLATION_FOREST.value,
                functionName=MINING_FUNCTION.REGRESSION.value,
                MiningSchema=mining_schema,
                Output=get_anomaly_detection_output(model),
                sampleDataSize=str(model.max_samples_),
                MiningModel=ensemble_model
            )
        )
    return anomaly_detection_model
def get_anomaly_detection_output(model):
    """
    Generates output for anomaly detection models

    Emits an "anomalyScore" field plus a boolean "outlier" decision field
    that compares the score against the model's fitted threshold.

    Parameters
    ----------
    model :
        Scikit-learn's model object

    Returns
    -------
    output_fields :
        Returns Nyoka's Output object
    """
    output_fields = list()
    output_fields.append(pml.OutputField(name="anomalyScore",
                                         optype=OPTYPE.CONTINUOUS.value,
                                         dataType=DATATYPE.DOUBLE.value,
                                         feature=RESULT_FEATURE.PREDICTED_VALUE.value,
                                         isFinalResult="false"))
    # OneClassSVM exposes `threshold_`; models without it fall back to 0.
    # (getattr replaces a bare `except:` that silently swallowed every error.)
    thresh = getattr(model, 'threshold_', 0)
    offset = 0
    operator = SIMPLE_PREDICATE_OPERATOR.LESS_THAN.value
    if model.__class__.__name__ == "IsolationForest":
        # IsolationForest flags outliers on the opposite side of the threshold
        # and shifts by the fitted decision offset.
        operator = SIMPLE_PREDICATE_OPERATOR.GREATER_THAN.value
        offset = model.offset_
        thresh = -1 * (thresh + offset)
    output_fields.append(
        pml.OutputField(name="outlier",
                        optype=OPTYPE.CATEGORICAL.value,
                        dataType=DATATYPE.BOOLEAN.value,
                        feature=RESULT_FEATURE.DECISION.value,
                        isFinalResult="true",
                        Apply=pml.Apply(function=operator,
                                        FieldRef=[pml.FieldRef(field="anomalyScore")],
                                        Constant=[pml.Constant(dataType=DATATYPE.DOUBLE.value,
                                                               valueOf_="0" if thresh == 0 else "{:.16f}".format(thresh))]))
    )
    return pml.Output(OutputField=output_fields)
def get_clustering_model(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name):
    """
    Generates PMML elements for clustering models

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    derived_col_names : List
        Contains column names after preprocessing
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    clustering_models : List
        Returns Nyoka's ClusteringModel object
    """
    import numpy as np
    base_kwargs = get_model_kwargs(model, col_names, target_name, mining_imp_val, categoric_values)
    # Distinct cluster labels and the number of training points in each cluster.
    label_values, label_counts = np.unique(model.labels_, return_counts=True)
    # Clustering needs its own Output (affinity per cluster + predicted cluster).
    base_kwargs["Output"] = get_output_for_clustering(label_values)
    clustering_model = pml.ClusteringModel(
        modelClass=CLUSTERING_MODEL_CLASS.CENTER_BASED.value,
        modelName=model_name if model_name else model.__class__.__name__,
        numberOfClusters=get_cluster_num(model),
        ComparisonMeasure=get_comp_measure(),
        ClusteringField=get_clustering_flds(derived_col_names),
        Cluster=get_cluster_vals(model, label_counts),
        **base_kwargs
    )
    return [clustering_model]
def get_output_for_clustering(values):
    """
    Generates output for clustering models

    Parameters
    ----------
    values : List
        Distinct cluster labels.

    Returns
    -------
    output_fields : List
        Returns Nyoka's Output object
    """
    # One affinity field per cluster label...
    affinity_fields = [
        pml.OutputField(
            name="affinity(" + str(idx) + ")",
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value,
            feature=RESULT_FEATURE.ENTITY_AFFINITY.value,
            value=str(val)
        )
        for idx, val in enumerate(values)
    ]
    # ...plus the predicted cluster itself.
    predicted_field = pml.OutputField(
        name="cluster",
        optype=OPTYPE.CATEGORICAL.value,
        dataType=DATATYPE.STRING.value,
        feature=RESULT_FEATURE.PREDICTED_VALUE.value
    )
    return pml.Output(OutputField=affinity_fields + [predicted_field])
def get_cluster_vals(model,counts):
    """
    Generates cluster information for clustering models

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    counts :
        Per-cluster sample counts, aligned with the cluster centroids.

    Returns
    -------
    cluster_flds : List
        Returns Nyoka's Cluster object
    """
    centroids = model.cluster_centers_
    cluster_flds = []
    for centroid_idx in range(centroids.shape[0]):
        centroid_values = ""
        centroid_flds = pml.ArrayType(type_=ARRAY_TYPE.REAL.value)
        # Incrementally build a space-separated coordinate string inside the
        # ArrayType's first content node.
        # NOTE(review): assumes pml.ArrayType pre-populates content_[0] —
        # confirm against the generated PMML44 classes.
        for centroid_cordinate_idx in range(centroids.shape[1]):
            centroid_flds.content_[0].value = centroid_values + "{:.16f}".format(centroids[centroid_idx][centroid_cordinate_idx])
            centroid_values = centroid_flds.content_[0].value + " "
        cluster_flds.append(pml.Cluster(id=str(centroid_idx), Array=centroid_flds,size=str(counts[centroid_idx])))
    return cluster_flds
def get_cluster_num(model):
    """
    Returns number of cluster for clustering models

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.

    Returns
    -------
    model.n_clusters: Integer
        Returns the number of clusters
    """
    # The cluster count the estimator was configured with.
    cluster_count = model.n_clusters
    return cluster_count
def get_comp_measure():
    """
    Generates comparison measure information for clustering models

    Returns
    -------
    Returns Nyoka's ComparisonMeasure object
    """
    # Clustering always uses euclidean distance here.
    return pml.ComparisonMeasure(
        euclidean=pml.euclidean(),
        kind=COMPARISON_MEASURE_KIND.DISTANCE.value
    )
def get_clustering_flds(col_names):
    """
    Generates cluster fields for clustering models

    Parameters
    ----------
    col_names :
        Contains list of feature/column names.

    Returns
    -------
    clustering_flds: List
        Returns Nyoka's ClusteringField object
    """
    # One ClusteringField per input feature.
    return [pml.ClusteringField(field=str(col)) for col in col_names]
def get_nearestNeighbour_model(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name):
    """
    Generates PMML elements for nearest neighbour model

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    derived_col_names : List
        Contains column names after preprocessing
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    nearest_neighbour_model :
        Returns Nyoka's NearestNeighborModel object
    """
    base_kwargs = get_model_kwargs(model, col_names, target_name, mining_imp_val, categoric_values)
    knn_model = pml.NearestNeighborModel(
        modelName=model_name if model_name else model.__class__.__name__,
        continuousScoringMethod=CONTINUOUS_SCORING_METHOD.AVERAGE.value,
        algorithmName="KNN",
        numberOfNeighbors=model.n_neighbors,
        KNNInputs=get_knn_inputs(derived_col_names),
        ComparisonMeasure=get_comparison_measure(model),
        TrainingInstances=get_training_instances(model, derived_col_names, target_name),
        **base_kwargs
    )
    return [knn_model]
def get_training_instances(model, derived_col_names, target_name):
    """
    It returns the Training Instance element.

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    derived_col_names : List
        Contains column names after preprocessing
    target_name : String
        Name of the Target column.

    Returns
    -------
    TrainingInstances :
        Returns Nyoka's TrainingInstances object
    """
    instance_fields = get_instance_fields(derived_col_names, target_name)
    inline_table = get_inline_table(model)
    return pml.TrainingInstances(InstanceFields=instance_fields, InlineTable=inline_table)
def get_inline_table(model):
    """
    It Returns the Inline Table element of the model.

    Serializes the KNN model's training data (`model._tree` points, `model._y`
    labels) into PMML rows with columns y, x1, x2, ...

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.

    Returns
    -------
    InlineTable :
        Returns Nyoka's InlineTable object
    """
    rows = []
    training_points = model._tree.get_arrays()[0].tolist()
    labels = model._y.tolist()
    # Column names x1..xN for the feature coordinates.
    feature_cols = ["x" + str(idx + 1)
                    for idx in range(len(model._tree.get_arrays()[0][0]))]
    for row_idx, sample in enumerate(training_points):
        row = pml.row()
        row.elementobjs_ = ['y'] + feature_cols
        if hasattr(model, 'classes_'):
            # Classifier: map the encoded label back to the original class.
            row.y = model.classes_[labels[row_idx]]
        else:
            row.y = labels[row_idx]
        # setattr instead of exec("row.x1=..."): assigns the value directly,
        # without round-tripping it through a string and re-evaluating it.
        for col_idx, value in enumerate(sample):
            setattr(row, feature_cols[col_idx], value)
        rows.append(row)
    return pml.InlineTable(row=rows)
def get_instance_fields(derived_col_names, target_name):
    """
    It returns the Instance field element.

    Parameters
    ----------
    derived_col_names : List
        Contains column names after preprocessing.
    target_name : String
        Name of the Target column.

    Returns
    -------
    InstanceFields :
        Returns Nyoka's InstanceFields object
    """
    # Target maps to column "y"; features map to "x1", "x2", ...
    fields = [pml.InstanceField(field=target_name, column="y")]
    fields.extend(
        pml.InstanceField(field=str(col), column="x" + str(pos + 1))
        for pos, col in enumerate(derived_col_names)
    )
    return pml.InstanceFields(InstanceField=fields)
def get_comparison_measure(model):
    """
    It return the Comparison measure element for nearest neighbour model.

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.

    Returns
    -------
    comp_measure :
        Returns Nyoka's ComparisonMeasure object.

    Raises
    ------
    NotImplementedError
        If the model's effective metric has no PMML equivalent.
    """
    distance = COMPARISON_MEASURE_KIND.DISTANCE.value
    similarity = COMPARISON_MEASURE_KIND.SIMILARITY.value
    metric = model.effective_metric_
    if metric == 'euclidean':
        return pml.ComparisonMeasure(euclidean=pml.euclidean(), kind=distance)
    if metric == 'minkowski':
        # Minkowski carries the model's p parameter.
        return pml.ComparisonMeasure(minkowski=pml.minkowski(p_parameter=model.p), kind=distance)
    if metric in ('manhattan', 'cityblock'):
        return pml.ComparisonMeasure(cityBlock=pml.cityBlock(), kind=distance)
    if metric == 'sqeuclidean':
        return pml.ComparisonMeasure(squaredEuclidean=pml.squaredEuclidean(), kind=distance)
    if metric == 'chebyshev':
        return pml.ComparisonMeasure(chebychev=pml.chebychev(), kind=distance)
    if metric == 'matching':
        return pml.ComparisonMeasure(simpleMatching=pml.simpleMatching(), kind=similarity)
    if metric == 'jaccard':
        return pml.ComparisonMeasure(jaccard=pml.jaccard(), kind=similarity)
    if metric == 'rogerstanimoto':
        return pml.ComparisonMeasure(tanimoto=pml.tanimoto(), kind=similarity)
    raise NotImplementedError("{} metric is not implemented for KNN Model!".format(metric))
def get_knn_inputs(col_names):
    """
    It returns the KNN Inputs element.

    Parameters
    ----------
    col_names : List
        Contains list of feature/column names.

    Returns
    -------
    KNNInputs :
        Returns Nyoka's KNNInputs object.
    """
    # One KNNInput per feature column.
    return pml.KNNInputs(KNNInput=[pml.KNNInput(field=str(col)) for col in col_names])
def get_naiveBayesModel(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name):
    """
    Generates PMML elements for naive bayes models

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    naive_bayes_model : List
        Returns Nyoka's NaiveBayesModel
    """
    base_kwargs = get_model_kwargs(model, col_names, target_name, mining_imp_val, categoric_values)
    nb_model = pml.NaiveBayesModel(
        modelName=model_name if model_name else model.__class__.__name__,
        BayesInputs=get_bayes_inputs(model, derived_col_names),
        BayesOutput=get_bayes_output(model, target_name),
        threshold=get_threshold(),
        **base_kwargs
    )
    return [nb_model]
def get_threshold():
    """
    It returns the Threshold value for Naive Bayes models.

    Returns
    -------
    Returns the Threshold value
    """
    # Fixed default threshold used for every exported NaiveBayesModel.
    default_threshold = '0.001'
    return default_threshold
def get_bayes_output(model, target_name):
    """
    It returns the Bayes Output element of the model

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    target_name : String
        Name of the Target column.

    Returns
    -------
    BayesOutput :
        Returns Nyoka's BayesOutput object
    """
    target_val_counts = pml.TargetValueCounts()
    # One TargetValueCount per class, with its training sample count.
    for cls_label, cls_count in zip(model.classes_, model.class_count_):
        target_val_counts.add_TargetValueCount(
            pml.TargetValueCount(value=str(cls_label), count=str(cls_count))
        )
    return pml.BayesOutput(fieldName=target_name, TargetValueCounts=target_val_counts)
def get_bayes_inputs(model, derived_col_names):
    """
    It returns the Bayes Input element of the naive bayes model .

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    derived_col_names : List
        Contains column names after preprocessing.

    Returns
    -------
    bayes_inputs :
        Returns Nyoka's BayesInput object.
    """
    bayes_inputs = pml.BayesInputs()
    # scikit-learn >= 1.0 renamed GaussianNB's `sigma_` attribute to `var_`;
    # accept either so the exporter works across sklearn versions.
    all_variances = getattr(model, 'sigma_', None)
    if all_variances is None:
        all_variances = model.var_
    for indx, name in enumerate(derived_col_names):
        means = model.theta_[:, indx]
        variances = all_variances[:, indx]
        target_val_stats = pml.TargetValueStats()
        # Per-class Gaussian (mean, variance) for this feature.
        for idx, val in enumerate(model.classes_):
            target_val = pml.TargetValueStat(
                val, GaussianDistribution=pml.GaussianDistribution(
                    mean="{:.16f}".format(means[idx]),
                    variance="{:.16f}".format(variances[idx])))
            target_val_stats.add_TargetValueStat(target_val)
        bayes_inputs.add_BayesInput(pml.BayesInput(fieldName=str(name),
                                                   TargetValueStats=target_val_stats))
    return bayes_inputs
def get_supportVectorMachine_models(model, derived_col_names, col_names, target_names,
                                    mining_imp_val, categoric_values, model_name):
    """
    Generates PMML elements for support vector machine models

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_names : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    supportVector_models : List
        Returns Nyoka's SupportVectorMachineModel object
    """
    base_kwargs = get_model_kwargs(model, col_names, target_names, mining_imp_val, categoric_values)
    # Kernel attributes come back as a kwargs dict (kernel element + params).
    kernel_kwargs = get_kernel_type(model)
    svm_model = pml.SupportVectorMachineModel(
        modelName=model_name if model_name else model.__class__.__name__,
        classificationMethod=get_classificationMethod(model),
        VectorDictionary=get_vectorDictionary(model, derived_col_names, categoric_values),
        SupportVectorMachine=get_supportVectorMachine(model),
        **kernel_kwargs,
        **base_kwargs
    )
    return [svm_model]
def get_ensemble_models(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name):
    """
    Generates PMML elemenets for ensemble models

    Parameters
    ----------
    model :
        An instance of Scikit-learn model.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value.
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    mining_models : List
        Returns Nyoka's MiningModel object
    """
    base_kwargs = get_model_kwargs(model, col_names, target_name, mining_imp_val, categoric_values)
    if model.__class__.__name__ == 'GradientBoostingRegressor':
        # GBR predictions are rescaled via a Targets element.
        base_kwargs['Targets'] = get_targets(model, target_name)
    segmentation = get_outer_segmentation(model, derived_col_names, col_names, target_name,
                                          mining_imp_val, categoric_values, model_name)
    ensemble = pml.MiningModel(
        modelName=model_name if model_name else model.__class__.__name__,
        Segmentation=segmentation,
        **base_kwargs
    )
    return [ensemble]
def get_targets(model, target_name):
    """
    It returns the Target element of the model.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    target_name : String
        Name of the Target column.

    Returns
    -------
    targets :
        Returns Nyoka's Target object
    """
    if model.__class__.__name__ == 'GradientBoostingRegressor':
        # GBR: shift by the initial estimator's mean and scale by learning rate.
        target = pml.Target(
            field=target_name,
            rescaleConstant="{:.16f}".format(model.init_.mean),
            rescaleFactor="{:.16f}".format(model.learning_rate)
        )
    else:
        # Other boosting models: shift by the base score only.
        target = pml.Target(
            field=target_name,
            rescaleConstant="{:.16f}".format(model.base_score)
        )
    return pml.Targets(Target=[target])
def get_multiple_model_method(model):
    """
    It returns the type of multiple model method for MiningModels.

    Parameters
    ----------
    model :
        A Scikit-learn model instance

    Returns
    -------
    The multiple model method for a MiningModel.
    """
    cls_name = model.__class__.__name__
    if cls_name == 'GradientBoostingClassifier':
        return MULTIPLE_MODEL_METHOD.MODEL_CHAIN.value
    if cls_name == 'GradientBoostingRegressor':
        return MULTIPLE_MODEL_METHOD.SUM.value
    if cls_name == 'RandomForestClassifier':
        return MULTIPLE_MODEL_METHOD.MAJORITY_VOTE.value
    if cls_name in ('RandomForestRegressor', 'IsolationForest'):
        return MULTIPLE_MODEL_METHOD.AVERAGE.value
    # Falls through to an implicit None for unrecognized classes,
    # matching the original behavior.
def get_outer_segmentation(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name):
    """
    It returns the Segmentation element of a MiningModel.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    segmentation :
        Nyoka's Segmentation object
    """
    combination_method = get_multiple_model_method(model)
    segments = get_segments(model, derived_col_names, col_names, target_name,
                            mining_imp_val, categoric_values, model_name)
    return pml.Segmentation(multipleModelMethod=combination_method, Segment=segments)
def get_segments(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name):
    """
    It returns the Segment element of a Segmentation.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    segments :
        Nyoka's Segment object
    """
    # GradientBoostingClassifier needs its dedicated segment builder;
    # every other ensemble uses the generic inner-segment path.
    if 'GradientBoostingClassifier' in str(model.__class__):
        return get_segments_for_gbc(model, derived_col_names, col_names, target_name,
                                    mining_imp_val, categoric_values, model_name)
    return get_inner_segments(model, derived_col_names, col_names, 0)
def get_segments_for_gbc(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name):
    """
    It returns list of Segments element of a Segmentation.

    One segment (a regression MiningModel summing the trees of one boosting
    column) is emitted per decision-function output, followed by a final
    RegressionModel segment that normalizes the transformed decision
    functions into class probabilities (logistic for binary, softmax for
    multi-class).

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    segments : List
        Nyoka's Segment object
    """
    segments = list()
    out_field_names = list()
    # model.estimators_ is a 2-D grid of trees (stages x outputs);
    # len(model.estimators_[0]) is the number of decision-function outputs
    # (1 for binary classification, n_classes otherwise).
    for estm_idx in range(len(model.estimators_[0])):
        mining_fields_for_first = list()
        for name in col_names:
            mining_fields_for_first.append(pml.MiningField(name=name))
        miningschema_for_first = pml.MiningSchema(MiningField=mining_fields_for_first)
        output_fields = list()
        # Raw (unscaled) sum of the trees for this decision-function column.
        output_fields.append(
            pml.OutputField(
                name='decisionFunction(' + str(estm_idx) + ')',
                feature=RESULT_FEATURE.PREDICTED_VALUE.value,
                dataType=DATATYPE.DOUBLE.value,
                isFinalResult=False
            )
        )
        # NOTE(review): model.init_.prior / model.init_.priors are internals of
        # older scikit-learn versions — verify against the supported sklearn range.
        if len(model.classes_) == 2:
            # Binary case: single output, prior + learning_rate * decisionFunction(0).
            output_fields.append(
                pml.OutputField(
                    name='transformedDecisionFunction(0)',
                    feature=RESULT_FEATURE.TRANSFORMED_VALUE.value,
                    dataType=DATATYPE.DOUBLE.value,
                    isFinalResult=True,
                    Apply=pml.Apply(
                        function=FUNCTION.ADDITION.value,
                        Constant=[pml.Constant(
                            dataType=DATATYPE.DOUBLE.value,
                            valueOf_="{:.16f}".format(model.init_.prior)
                        )],
                        Apply_member=[pml.Apply(
                            function=FUNCTION.MULTIPLICATION.value,
                            Constant=[pml.Constant(
                                dataType=DATATYPE.DOUBLE.value,
                                valueOf_="{:.16f}".format(model.learning_rate)
                            )],
                            FieldRef=[pml.FieldRef(
                                field="decisionFunction(0)",
                            )]
                        )]
                    )
                )
            )
        else:
            # Multi-class: one output per class,
            # priors[k] + learning_rate * decisionFunction(k).
            output_fields.append(
                pml.OutputField(
                    name='transformedDecisionFunction(' + str(estm_idx) + ')',
                    feature=RESULT_FEATURE.TRANSFORMED_VALUE.value,
                    dataType=DATATYPE.DOUBLE.value,
                    isFinalResult=True,
                    Apply=pml.Apply(
                        function=FUNCTION.ADDITION.value,
                        Constant=[pml.Constant(
                            dataType=DATATYPE.DOUBLE.value,
                            valueOf_="{:.16f}".format(model.init_.priors[estm_idx])
                        )],
                        Apply_member=[pml.Apply(
                            function=FUNCTION.MULTIPLICATION.value,
                            Constant=[pml.Constant(
                                dataType=DATATYPE.DOUBLE.value,
                                valueOf_="{:.16f}".format(model.learning_rate)
                            )],
                            FieldRef=[pml.FieldRef(
                                field="decisionFunction(" + str(estm_idx) + ")",
                            )]
                        )]
                    )
                )
            )
        out_field_names.append('transformedDecisionFunction(' + str(estm_idx) + ')')
        # Each decision function becomes a regression MiningModel whose inner
        # segmentation SUMs the individual trees of that boosting column.
        segments.append(
            pml.Segment(
                True_=pml.True_(),
                id=str(estm_idx),
                MiningModel=pml.MiningModel(
                    functionName=MINING_FUNCTION.REGRESSION.value,
                    modelName="MiningModel",
                    MiningSchema=miningschema_for_first,
                    Output=pml.Output(OutputField=output_fields),
                    Segmentation=pml.Segmentation(
                        multipleModelMethod=MULTIPLE_MODEL_METHOD.SUM.value,
                        Segment=get_inner_segments(model, derived_col_names,
                                                   col_names, estm_idx)
                    )
                )
            )
        )
    # Final segment: a RegressionModel over the transformed decision functions
    # (used both as input fields and derived names) that yields class probabilities.
    reg_model = get_regrs_models(model, out_field_names,out_field_names, target_name, mining_imp_val, categoric_values, model_name)[0]
    reg_model.Output = None
    if len(model.classes_) == 2:
        reg_model.normalizationMethod=REGRESSION_NORMALIZATION_METHOD.LOGISTIC.value
    else:
        reg_model.normalizationMethod=REGRESSION_NORMALIZATION_METHOD.SOFTMAX.value
    segments.append(
        pml.Segment(
            id=str(len(model.estimators_[0])),
            True_=pml.True_(),
            RegressionModel=reg_model
        )
    )
    return segments
def get_inner_segments(model, derived_col_names, col_names, index):
    """
    Build one Segment per tree estimator of an ensemble model.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    index : Integer
        The index of the estimator for the model

    Returns
    -------
    segments : List
        Nyoka's Segment object
    """
    import numpy as np
    segments = list()
    # estimators_ is either a flat list of trees (forests) or a 2-D grid
    # (gradient boosting), in which case `index` selects the output column.
    flat_estimators = np.asanyarray(model.estimators_).ndim == 1
    for estm_idx in range(model.n_estimators):
        if flat_estimators:
            estm = model.estimators_[estm_idx]
        else:
            estm = model.estimators_[estm_idx][index]
        # Collect the distinct feature ids the tree actually splits on
        # (-2 marks "no feature" / leaf entries in sklearn's tree arrays).
        used_features = list()
        for feat in estm.tree_.feature:
            if feat != -2 and feat not in used_features:
                used_features.append(feat)
        # Skip degenerate trees that never split on any feature.
        if not used_features:
            continue
        mining_fields = [pml.MiningField(name=feat) for feat in col_names]
        segments.append(
            pml.Segment(
                True_=pml.True_(),
                id=str(estm_idx),
                TreeModel=pml.TreeModel(
                    modelName=estm.__class__.__name__,
                    functionName=get_mining_func(estm),
                    splitCharacteristic=TREE_SPLIT_CHARACTERISTIC.MULTI.value,
                    MiningSchema=pml.MiningSchema(MiningField=mining_fields),
                    Node=get_node(estm, derived_col_names, model)
                )
            )
        )
    return segments
def get_classificationMethod(model):
    """
    Return the PMML classification method name for an SVM model.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.

    Returns
    -------
    Returns the classification method of the SVM model
    """
    # SVC uses one-vs-one; every other SVM variant is exported as one-vs-rest.
    is_svc = model.__class__.__name__ == 'SVC'
    return SVM_CLASSIFICATION_METHOD.OVO.value if is_svc else SVM_CLASSIFICATION_METHOD.OVR.value
def get_vectorDictionary(model, derived_col_names, categoric_values):
    """
    Build the VectorDictionary element for an SVM model.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    derived_col_names : List
        Contains column names after preprocessing.
    categoric_values : tuple
        Contains Categorical attribute names and its values

    Returns
    -------
    VectorDictionary :
        Nyoka's VectorDictionary object
    """
    field_refs = [pml.FieldRef(field=name) for name in derived_col_names]
    vector_fields = pml.VectorFields(FieldRef=field_refs)
    vector_ids = list(model.support_)
    vectors = list(model.support_vectors_)
    # Sparse (csr_matrix) support vectors must be densified row-by-row before export.
    is_sparse = model.support_vectors_.__class__.__name__ == 'csr_matrix'
    vector_instances = list()
    for idx, vec in enumerate(vectors):
        if is_sparse:
            entries = vec.todense().tolist()[0]
        else:
            entries = vec.tolist()
        vector_instances.append(pml.VectorInstance(
            id=vector_ids[idx],
            REAL_SparseArray=pml.REAL_SparseArray(
                n=len(field_refs),
                Indices=list(range(1, len(entries) + 1)),
                REAL_Entries=entries
            )
        ))
    return pml.VectorDictionary(VectorFields=vector_fields, VectorInstance=vector_instances)
def get_kernel_type(model):
    """
    Return the PMML kernel-type keyword argument for an SVM model.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.

    Returns
    -------
    kernel_kwargs : Dictionary
        Get the respective kernel type of the SVM model.

    Raises
    ------
    NotImplementedError
        If the model's kernel is not one of linear/poly/rbf/sigmoid.
    """
    kernel = model.kernel
    if kernel == 'linear':
        return {'LinearKernelType': pml.LinearKernelType(description='Linear Kernel Type')}
    if kernel == 'poly':
        return {'PolynomialKernelType': pml.PolynomialKernelType(
            description='Polynomial Kernel type',
            gamma="{:.16f}".format(model._gamma),
            coef0="{:.16f}".format(model.coef0),
            degree=model.degree)}
    if kernel == 'rbf':
        return {'RadialBasisKernelType': pml.RadialBasisKernelType(
            description='Radial Basis Kernel Type',
            gamma="{:.16f}".format(model._gamma))}
    if kernel == 'sigmoid':
        return {'SigmoidKernelType': pml.SigmoidKernelType(
            description='Sigmoid Kernel Type',
            gamma="{:.16f}".format(model._gamma),
            coef0="{:.16f}".format(model.coef0))}
    raise NotImplementedError("{} kernel is not implemented!".format(kernel))
def get_supportVectorMachine(model):
    """
    Generates PMML elements for support vector machine models

    For SVR/OneClassSVM a single SupportVectorMachine element is produced.
    For SVC one element is produced per one-vs-one class pair, following
    scikit-learn's packed dual_coef_ / support_ layout.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.

    Returns
    -------
    support_vector_machines : List
        Nyoka's SupportVectorMachineModel object
    """
    support_vector_machines = list()
    if model.__class__.__name__ in ['SVR','OneClassSVM']:
        # Single-machine case: all support vectors, one coefficient list.
        support_vector = list()
        for sv in model.support_:
            support_vector.append(pml.SupportVector(vectorId=sv))
        support_vectors = pml.SupportVectors(SupportVector=support_vector)
        coefficient = list()
        absoValue = model.intercept_[0]
        # dual_coef_ may be sparse (csr_matrix); read .data directly in that case.
        if model.dual_coef_.__class__.__name__ != 'csr_matrix':
            for coef in model.dual_coef_:
                for num in coef:
                    coefficient.append(pml.Coefficient(value="{:.16f}".format(num)))
        else:
            dual_coefficent=model.dual_coef_.data
            for num in dual_coefficent:
                coefficient.append(pml.Coefficient(value="{:.16f}".format(num)))
        coeff = pml.Coefficients(absoluteValue=absoValue, Coefficient=coefficient)
        support_vector_machines.append(pml.SupportVectorMachine(SupportVectors=support_vectors, Coefficients=coeff))
    else:
        # SVC one-vs-one case. scikit-learn packs the support vectors of all
        # classes contiguously; support_vector_locs gives the start offset of
        # each class's block within model.support_.
        import numpy as np
        support_vector_locs = np.cumsum(np.hstack([[0], model.n_support_]))
        n_class = model.dual_coef_.shape[0] + 1
        coef_abs_val_index = 0  # walks intercept_ in pair order
        for class1 in range(n_class):
            sv1 = model.support_[support_vector_locs[class1]:support_vector_locs[class1 + 1]]
            for class2 in range(class1 + 1, n_class):
                # One machine per unordered class pair (class1, class2).
                svs = list()
                coefs = list()
                sv2 = model.support_[support_vector_locs[class2]:support_vector_locs[class2 + 1]]
                svs.append((list(sv1) + list(sv2)))
                # Dual coefficients of class1's vectors against class2, and vice
                # versa, per sklearn's dual_coef_ row layout for OVO machines.
                alpha1 = model.dual_coef_[class2 - 1, support_vector_locs[class1]:support_vector_locs[class1 + 1]]
                alpha2 = model.dual_coef_[class1, support_vector_locs[class2]:support_vector_locs[class2 + 1]]
                coefs.append((list(alpha1) + list(alpha2)))
                all_svs = list()
                for sv in (svs[0]):
                    all_svs.append(pml.SupportVector(vectorId=sv))
                all_coefs = list()
                for coef in (coefs[0]):
                    all_coefs.append(pml.Coefficient(value="{:.16f}".format(coef)))
                coef_abs_value = model.intercept_[coef_abs_val_index]
                coef_abs_val_index += 1
                # NOTE(review): the target/alternate category order is swapped
                # between the binary and multi-class cases — presumably to match
                # the sign convention of the decision function; verify against a
                # reference PMML scorer.
                if len(model.classes_) == 2:
                    support_vector_machines.append(
                        pml.SupportVectorMachine(
                            targetCategory=model.classes_[class1],
                            alternateTargetCategory=model.classes_[class2],
                            SupportVectors=pml.SupportVectors(SupportVector=all_svs),
                            Coefficients=pml.Coefficients(absoluteValue="{:.16f}".format(coef_abs_value), Coefficient=all_coefs)
                        )
                    )
                else:
                    support_vector_machines.append(
                        pml.SupportVectorMachine(
                            targetCategory=model.classes_[class2],
                            alternateTargetCategory=model.classes_[class1],
                            SupportVectors=pml.SupportVectors(SupportVector=all_svs),
                            Coefficients=pml.Coefficients(absoluteValue="{:.16f}".format(coef_abs_value), Coefficient=all_coefs)
                        )
                    )
    return support_vector_machines
def get_tree_models(model, derived_col_names, col_names, target_name, mining_imp_val,categoric_values,model_name):
    """
    Generates PMML elements for tree models

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    derived_col_names :
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    tree_models : List
        Nyoka's TreeModel object
    """
    common_kwargs = get_model_kwargs(model, col_names, target_name, mining_imp_val,categoric_values)
    # Fall back to the estimator's class name when no explicit name was given.
    display_name = model_name if model_name else model.__class__.__name__
    tree_model = pml.TreeModel(
        modelName=display_name,
        Node=get_node(model, derived_col_names),
        **common_kwargs
    )
    return [tree_model]
def get_neural_models(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values,model_name):
    """
    Generates PMML elements for neural network models

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value.
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    neural_model : List
        Nyoka's NeuralNetwork object
    """
    common_kwargs = get_model_kwargs(model, col_names, target_name, mining_imp_val,categoric_values)
    layers, outputs = get_neural_layer(model, derived_col_names, target_name)
    display_name = model_name if model_name else model.__class__.__name__
    network = pml.NeuralNetwork(
        modelName=display_name,
        threshold='0',
        altitude='1.0',
        activationFunction=get_funct(model),
        NeuralInputs=get_neuron_input(derived_col_names),
        NeuralLayer=layers,
        NeuralOutputs=outputs,
        **common_kwargs
    )
    return [network]
def get_funct(sk_model):
    """
    Return the PMML activation function name for a neural network model.

    Parameters
    ----------
    sk_model :
        A Scikit-learn model instance.

    Returns
    -------
    a_fn : String
        Returns the activation function.
    """
    activation = sk_model.activation
    # Only 'relu' needs translation to PMML vocabulary; the other sklearn
    # activation names are passed through unchanged.
    if activation == 'relu':
        return NN_ACTIVATION_FUNCTION.RECTIFIER.value
    return activation
def get_regrs_models(model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values,model_name):
    """
    Generates PMML elements for linear models

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    regrs_models : List
        Nyoka's RegressionModel object
    """
    common_kwargs = get_model_kwargs(model, col_names, target_name, mining_imp_val, categoric_values)
    # Classifiers (anything but plain linear regressors) need logistic normalization.
    if model.__class__.__name__ not in ('LinearRegression', 'LinearSVR'):
        common_kwargs['normalizationMethod'] = REGRESSION_NORMALIZATION_METHOD.LOGISTIC.value
    display_name = model_name if model_name else model.__class__.__name__
    regression_model = pml.RegressionModel(
        modelName=display_name,
        RegressionTable=get_regrs_tabl(model, derived_col_names, target_name, categoric_values),
        **common_kwargs
    )
    return [regression_model]
def get_regrs_tabl(model, feature_names, target_name, categoric_values):
    """
    It returns the Regression Table element of the model.

    Models with an intercept_/coef_ pair (linear models) get a table of
    NumericPredictors from their coefficients. Models without (e.g. the
    final stage of a GradientBoostingClassifier) get identity tables over
    the decision-function fields.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    feature_names : List
        Contains column names after preprocessing.
    target_name : String
        Name of the Target column.
    categoric_values : tuple
        Contains Categorical attribute names and its values

    Returns
    -------
    merge : List
        Nyoka's RegressionTable object
    """
    merge = list()
    if hasattr(model, 'intercept_'):
        import numpy as np
        func_name = get_mining_func(model)
        inter = model.intercept_
        model_coef = model.coef_
        target_classes = target_name
        if not hasattr(inter, '__iter__') or model.__class__.__name__ in ['LinearRegression','LinearSVR']:
            # Pure regression: normalize scalar intercept/coefs to 2-D shape
            # (1, n_features) so the predictor loop below is uniform.
            inter = np.array([inter])
            target_classes = [target_classes]
            model_coef = np.ravel(model_coef)
            model_coef = model_coef.reshape(1, model_coef.shape[0])
            target_cat = None
        else:
            # Classification: the table targets the last class label.
            target_classes = model.classes_
            max_target_index = len(target_classes) - 1
            target_cat = target_classes[max_target_index]
        # Flatten a single-row coefficient matrix to a plain vector.
        if hasattr(model_coef[0],"__len__"):
            model_coef = model_coef[0]
        reg_preds=list()
        for idx, feat in enumerate(feature_names):
            reg_preds.append(pml.NumericPredictor(name=feat, coefficient="{:.16f}".format(model_coef[idx])))
        # NOTE(review): inter.item() assumes a single intercept — presumably only
        # binary classifiers / single-output regressors reach here; verify.
        merge.append(
            pml.RegressionTable(
                intercept="{:.16f}".format(inter.item()),
                targetCategory=target_cat,
                NumericPredictor=reg_preds
            )
        )
        # Binary classification needs a zero-intercept reference table for the
        # other class.
        if func_name != MINING_FUNCTION.REGRESSION.value:
            merge.append(
                pml.RegressionTable(
                    intercept="0.0",
                    targetCategory=target_classes[0]
                )
            )
    else:
        # No intercept_: identity tables mapping each decision-function field
        # (passed in via feature_names) to its class.
        if len(model.classes_) == 2:
            merge.append(
                pml.RegressionTable(
                    NumericPredictor=[pml.NumericPredictor(coefficient='1.0',name=feature_names[0])],
                    intercept='0.0',
                    targetCategory=str(model.classes_[-1])
                )
            )
            merge.append(
                pml.RegressionTable(intercept='0.0', targetCategory=str(model.classes_[0]))
            )
        else:
            # One identity table per class / decision-function field.
            for feat_idx in range(len(feature_names)):
                merge.append(
                    pml.RegressionTable(
                        NumericPredictor=[pml.NumericPredictor(coefficient='1.0',name=feature_names[feat_idx])],
                        intercept='0.0',
                        targetCategory=str(model.classes_[feat_idx])
                    )
                )
    return merge
def get_node(model, features_names, main_model=None):
    """
    Build the recursive Node element (the decision tree) for a tree estimator.

    Parameters
    ----------
    model :
        An instance of the estimator of the tree object.
    features_names : List
        Contains the list of feature/column name.
    main_model :
        A Scikit-learn model instance (the enclosing ensemble, if any).

    Returns
    -------
    Nyoka's Node object for the tree root, with all child Nodes attached.
    """
    tree = model.tree_
    node_samples = tree.n_node_samples
    # Class labels come from the enclosing ensemble for RandomForestClassifier,
    # otherwise from the estimator itself (regressors define no classes_; the
    # name is then only referenced on the classifier branch below).
    if main_model and main_model.__class__.__name__ == 'RandomForestClassifier':
        classes = main_model.classes_
    elif hasattr(model, 'classes_'):
        classes = model.classes_
    tree_leaf = -1  # scikit-learn marks leaf children with -1

    def _getNode(idx, parent=None, cond=None):
        # Build the Node for tree position `idx`; `cond` is the SimplePredicate
        # leading to it (None only for the root, which gets a True predicate).
        simple_pred_cond = None
        if cond:
            simple_pred_cond = cond
        node = pml.Node(id=idx, recordCount=float(tree.n_node_samples[idx]))
        if simple_pred_cond:
            node.SimplePredicate = simple_pred_cond
        else:
            node.True_ = pml.True_()
        if tree.children_left[idx] != tree_leaf:
            # Internal node: emit <= / > predicates for the two children.
            fieldName = features_names[tree.feature[idx]]
            prnt = None
            if model.__class__.__name__ == "ExtraTreeRegressor":
                # Depth counter, used by the IsolationForest leaf score below.
                prnt = parent + 1
            # Round the threshold to its own number of decimal digits, capped
            # at 16, to keep the PMML readable.
            # BUG FIX: was `min(rnd_, 16)` which compares a str with an int and
            # always raises TypeError in Python 3, silently skipping rounding.
            try:
                decimals = str(tree.threshold[idx]).split(".")[1]
                thresh = round(tree.threshold[idx], min(len(decimals), 16))
            except Exception:
                # e.g. a repr without a '.' part (scientific notation) — keep raw.
                thresh = tree.threshold[idx]
            simplePredicate = pml.SimplePredicate(field=fieldName,
                                                  operator=SIMPLE_PREDICATE_OPERATOR.LESS_OR_EQUAL.value,
                                                  value=str(thresh))
            left_child = _getNode(tree.children_left[idx], prnt, simplePredicate)
            simplePredicate = pml.SimplePredicate(field=fieldName,
                                                  operator=SIMPLE_PREDICATE_OPERATOR.GREATER_THAN.value,
                                                  value=str(thresh))
            right_child = _getNode(tree.children_right[idx], prnt, simplePredicate)
            node.add_Node(left_child)
            node.add_Node(right_child)
        else:
            # Leaf node: attach the score (and score distribution for classifiers).
            nodeValue = list(tree.value[idx][0])
            lSum = float(sum(nodeValue))
            if model.__class__.__name__ == 'DecisionTreeClassifier':
                probs = [x / lSum for x in nodeValue]
                score_dst = []
                for i in range(len(probs)):
                    score_dst.append(pml.ScoreDistribution(confidence=probs[i],
                                                           recordCount=float(nodeValue[i]),
                                                           value=classes[i]))
                node.ScoreDistribution = score_dst
                node.score = classes[probs.index(max(probs))]
            else:
                if model.__class__.__name__ == "ExtraTreeRegressor":
                    # IsolationForest: depth reached plus the expected remaining
                    # path length over the samples in this leaf.
                    nd_sam = node_samples[int(idx)]
                    node.score = "{:.16f}".format(parent + avgPathLength(nd_sam))
                else:
                    node.score = "{:.16f}".format(lSum)
        return node

    if model.__class__.__name__ == "ExtraTreeRegressor":
        # Seed the depth counter at 0 for IsolationForest trees.
        return _getNode(0, 0)
    return _getNode(0)
def avgPathLength(n):
    """
    Generates average path length for Isolation forest models

    Computes c(n) = 2*(ln(n-1) + Euler's constant) - 2*(n-1)/n, the average
    path length of an unsuccessful binary search over n samples.

    Parameters
    ----------
    n : int
        Number of samples

    Returns
    -------
    The average path length
    """
    if n <= 1.0:
        return 1.0
    euler_gamma = 0.57721566  # Euler–Mascheroni constant (truncated)
    harmonic_approx = math.log(n - 1.0) + euler_gamma
    return 2.0 * harmonic_approx - 2.0 * (n - 1.0) / n
def get_output(model, target_name):
    """
    Build the Output element of the model.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    target_name : String
        Name of the Target column.

    Returns
    -------
    Output :
        Nyoka's Output object
    """
    output_fields = list()
    if not has_target(model):
        # Target-less models (anomaly detectors) expose a single continuous score.
        output_fields.append(pml.OutputField(
            name='predicted',
            feature=RESULT_FEATURE.PREDICTED_VALUE.value,
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value
        ))
        return pml.Output(OutputField=output_fields)
    alt_target_name = 'predicted_' + target_name
    if get_mining_func(model) == MINING_FUNCTION.CLASSIFICATION.value:
        # One probability field per class, then the categorical prediction.
        for cls in model.classes_:
            output_fields.append(pml.OutputField(
                name='probability_' + str(cls),
                feature=RESULT_FEATURE.PROBABILITY.value,
                optype=OPTYPE.CONTINUOUS.value,
                dataType=DATATYPE.DOUBLE.value,
                value=str(cls)
            ))
        output_fields.append(pml.OutputField(
            name=alt_target_name,
            feature=RESULT_FEATURE.PREDICTED_VALUE.value,
            optype=OPTYPE.CATEGORICAL.value,
            dataType=get_dtype(model.classes_[0])
        ))
    else:
        output_fields.append(pml.OutputField(
            name=alt_target_name,
            feature=RESULT_FEATURE.PREDICTED_VALUE.value,
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value
        ))
    return pml.Output(OutputField=output_fields)
def get_mining_func(model):
    """
    Return the PMML mining function name for the model.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.

    Returns
    -------
    func_name : String
        Returns the function name of the model
    """
    import numpy as np
    if hasattr(model, 'classes_'):
        # Only an ndarray of class labels marks a true classifier.
        if isinstance(model.classes_, np.ndarray):
            return MINING_FUNCTION.CLASSIFICATION.value
        return MINING_FUNCTION.REGRESSION.value
    if hasattr(model, 'n_clusters'):
        return MINING_FUNCTION.CLUSTERING.value
    return MINING_FUNCTION.REGRESSION.value
def get_mining_schema(model, feature_names, target_name, mining_imp_val, categoric_values):
    """
    It returns the Mining Schema of the model.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    feature_names : List
        Contains the list of feature/column name.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value.
    categoric_values : tuple
        Contains Categorical attribute names and its values

    Returns
    -------
    MiningSchema :
        Nyoka's MiningSchema object
    """
    if mining_imp_val:
        mining_attributes = mining_imp_val[0]
        mining_strategy = mining_imp_val[1]
        mining_replacement_val = mining_imp_val[2]
    n_features = len(feature_names)
    # All features are exported as continuous/active by default; categorical
    # ones are overridden below.
    features_pmml_optype = [OPTYPE.CONTINUOUS.value] * n_features
    features_pmml_utype = [FIELD_USAGE_TYPE.ACTIVE.value] * n_features
    target_pmml_utype = FIELD_USAGE_TYPE.TARGET.value
    mining_func = get_mining_func(model)
    # NOTE(review): for clustering models target_pmml_optype stays unset here;
    # that is safe only because KMeans is excluded from the target field below.
    if mining_func == MINING_FUNCTION.CLASSIFICATION.value:
        target_pmml_optype = OPTYPE.CATEGORICAL.value
    elif mining_func == MINING_FUNCTION.REGRESSION.value:
        target_pmml_optype = OPTYPE.CONTINUOUS.value
    mining_flds = list()
    mining_name_stored = list()
    # handling impute pre processing
    # Each imputed feature carries its replacement value and treatment strategy.
    if mining_imp_val:
        for mining_item, mining_idx in zip(mining_attributes, range(len(mining_attributes))):
            for feat_name,feat_idx in zip(feature_names, range(len(feature_names))):
                if feat_name in mining_item:
                    if feat_name not in mining_name_stored:
                        impute_index = mining_item.index(feat_name)
                        mining_flds.append(pml.MiningField(name=str(feat_name),
                                                           optype=features_pmml_optype[feat_idx],
                                                           missingValueReplacement=mining_replacement_val[mining_idx][
                                                               impute_index],
                                                           missingValueTreatment=mining_strategy[mining_idx],
                                                           usageType=features_pmml_utype[feat_idx]))
                        mining_name_stored.append(feat_name)
    # Categorical attributes get explicit categorical mining fields.
    if len(categoric_values) > 0:
        for cls_attr in categoric_values[1]:
            mining_flds.append(pml.MiningField(
                name=cls_attr,
                usageType=FIELD_USAGE_TYPE.ACTIVE.value,
                optype=OPTYPE.CATEGORICAL.value
            ))
            mining_name_stored.append(cls_attr)
    # Remaining features not already covered above.
    for feat_name, feat_idx in zip(feature_names, range(len(feature_names))):
        if feat_name not in mining_name_stored:
            mining_flds.append(pml.MiningField(name=str(feat_name),
                                               optype=features_pmml_optype[feat_idx],
                                               usageType=features_pmml_utype[feat_idx]))
    # Target-less model types get no target mining field.
    if model.__class__.__name__ not in ['KMeans', 'IsolationForest', 'OneClassSVM']:
        mining_flds.append(pml.MiningField(name=target_name,
                                           optype=target_pmml_optype,
                                           usageType=target_pmml_utype))
    return pml.MiningSchema(MiningField=mining_flds)
def get_neuron_input(feature_names):
    """
    Build the NeuralInputs element, one NeuralInput per feature.

    Parameters
    ----------
    feature_names : List
        Contains the list of feature/column name.

    Returns
    -------
    neural_input_element :
        Returns Nyoka's NeuralInput object
    """
    neural_inputs = list()
    for feat in feature_names:
        # Each input neuron is a continuous DerivedField referencing the feature.
        derived_field = pml.DerivedField(
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value,
            FieldRef=pml.FieldRef(field=str(feat))
        )
        neural_inputs.append(pml.NeuralInput(id=str(feat), DerivedField=derived_field))
    return pml.NeuralInputs(NeuralInput=neural_inputs, numberOfInputs=str(len(neural_inputs)))
def get_neural_layer(model, feature_names, target_name):
    """
    It returns the Neural Layer and Neural Ouptput element.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    feature_names : List
        Contains the list of feature/column name.
    target_name : String
        Name of the Target column.

    Returns
    -------
    all_neuron_layer : List
        Nyoka's NeuralLayer object
    neural_output_element :
        Nyoka's NeuralOutput object
    """
    weight = model.coefs_
    bias = model.intercepts_
    last_layer = bias[-1]
    hidden_layer_sizes = model.hidden_layer_sizes
    # Treat the output layer as one more "hidden" layer so a single loop
    # builds the full stack.
    hidden_layers = list(hidden_layer_sizes)
    hidden_layers.append(len(last_layer))
    neuron = list()
    all_neuron_layer = list()
    input_features = feature_names
    neuron_id = list()
    for count in range(len(hidden_layers)):
        for count1 in range(hidden_layers[count]):
            con = list()
            # Connect this neuron to every neuron (or input field) of the
            # previous layer with the corresponding weight.
            for count2 in range(len(input_features)):
                con.append(pml.Con(from_ = input_features[count2], weight = format(weight[count][count2][count1])))
            # Neuron ids are "<layer><index>" strings, e.g. "01" for layer 0, neuron 1.
            neuron.append(pml.Neuron(id = str(count)+str(count1), bias = format(bias[count][count1]),Con = con))
            neuron_id.append(str(count)+str(count1))
        all_neuron_layer.append(pml.NeuralLayer(Neuron = neuron))
        # The ids of this layer become the inputs of the next one.
        input_features = neuron_id
        neuron_id = list()
        neuron = list()
    all_neuron_layer[-1].activationFunction = NN_ACTIVATION_FUNCTION.IDENTITY.value
    if hasattr(model, "classes_"):
        if len(model.classes_) == 2:
            # Binary classifier: append a logistic neuron over the single output,
            # then two identity neurons ("false"/"true") producing the two class
            # scores p and 1-p via the weights below.
            bias1=[1.0,0.0]
            weight1=[-1.0,1.0]
            con = list()
            linear = ['logistic/1']
            i_d = ['false', 'true']
            con.append(pml.Con(from_ = input_features[0], weight = 1.0))
            neuron.append(pml.Neuron(id = linear[0], bias = ('0.0'), Con = con))
            all_neuron_layer.append(pml.NeuralLayer(activationFunction = NN_ACTIVATION_FUNCTION.LOGISTIC.value, Neuron = neuron))
            neuron = list()
            con = list()
            for num in range(2):
                con.append(pml.Con(from_ = linear[0], weight = format(weight1[num])))
                neuron.append(pml.Neuron(id = i_d[num], bias = format(bias1[num]), Con = con))
                con = list()
            all_neuron_layer.append(pml.NeuralLayer(activationFunction = NN_ACTIVATION_FUNCTION.IDENTITY.value, Neuron = neuron))
            input_features = i_d
        else:
            # Multi-class: normalize the final layer with the model's own
            # output activation (e.g. softmax).
            all_neuron_layer[-1].normalizationMethod = model.out_activation_
        # One NeuralOutput per class, mapped through a NormDiscrete field.
        neural_output = list()
        for values, count in zip(model.classes_, range(len(model.classes_))):
            norm_discrete = pml.NormDiscrete(field = target_name, value = str(values))
            derived_flds = pml.DerivedField(optype = OPTYPE.CATEGORICAL.value, dataType = DATATYPE.DOUBLE.value,
                                            NormDiscrete = norm_discrete)
            if len(input_features)==1:
                class_node = pml.NeuralOutput(outputNeuron = input_features[0], DerivedField = derived_flds)
            else:
                class_node = pml.NeuralOutput(outputNeuron = input_features[count],DerivedField = derived_flds)
            neural_output.append(class_node)
        neural_output_element = pml.NeuralOutputs(numberOfOutputs = None, Extension = None,
                                                  NeuralOutput = neural_output)
    else:
        # Regressor: a single continuous output bound to the target field.
        neural_output = list()
        fieldRef = pml.FieldRef(field = target_name)
        derived_flds = pml.DerivedField(optype = OPTYPE.CONTINUOUS.value, dataType = DATATYPE.DOUBLE.value, FieldRef = fieldRef)
        class_node = pml.NeuralOutput(outputNeuron = input_features[0], DerivedField = derived_flds)
        neural_output.append(class_node)
        neural_output_element = pml.NeuralOutputs(numberOfOutputs = None, Extension = None, NeuralOutput = neural_output)
    return all_neuron_layer, neural_output_element
def get_super_cls_names(model_inst):
    """
    Collect the names of the model's class and all its ancestor classes.

    Parameters
    ----------
    model_inst :
        Instance of the scikit-learn model

    Returns
    -------
    parents : Set
        Returns all the parent class of the model instance.
    """
    # Iterative walk over the class hierarchy instead of recursion.
    parents = set()
    pending = [model_inst.__class__]
    while pending:
        cls = pending.pop()
        parents.add(cls.__name__)
        pending.extend(cls.__bases__)
    return parents
from nyoka import metadata
def get_dtype(feat_value):
    """
    Return the PMML data type for a sample value.

    Parameters
    ----------
    feat_value :
        Contains a value for finding the its data type.

    Returns
    -------
    Returns the respective data type of that value.
    """
    # Substring match on the class name covers both builtins and numpy scalar
    # types ('float64', 'int64', 'str_', ...).
    type_name = feat_value.__class__.__name__
    if 'float' in type_name:
        return DATATYPE.DOUBLE.value
    if 'int' in type_name:
        return DATATYPE.INTEGER.value
    if 'str' in type_name:
        return DATATYPE.STRING.value
    # Unsupported types (e.g. bool) fall through and return None, as before.
def get_data_dictionary(model, feature_names, target_name, categoric_values):
    """
    It returns the Data Dictionary element.

    Parameters
    ----------
    model :
        A Scikit-learn model instance.
    feature_names : List
        Contains the list of feature/column name.
    target_name : List
        Name of the Target column.
    categoric_values : tuple
        Contains Categorical attribute names and its values

    Returns
    -------
    data_dict :
        Returns Nyoka's DataDictionary object
    """
    categoric_feature_name = list()
    if categoric_values:
        categoric_labels = categoric_values[0]
        categoric_feature_name = categoric_values[1]
    target_attr_values = []
    n_features = len(feature_names)
    # Features are exported as continuous doubles by default.
    features_pmml_optype = [OPTYPE.CONTINUOUS.value] * n_features
    features_pmml_dtype = [DATATYPE.DOUBLE.value] * n_features
    mining_func = get_mining_func(model)
    # NOTE(review): for clustering models target_pmml_optype/dtype stay unset;
    # safe only because KMeans is excluded from the target DataField below.
    if mining_func == MINING_FUNCTION.CLASSIFICATION.value:
        target_pmml_optype = OPTYPE.CATEGORICAL.value
        target_pmml_dtype = get_dtype(model.classes_[0])
        target_attr_values = model.classes_.tolist()
    elif mining_func == MINING_FUNCTION.REGRESSION.value:
        target_pmml_optype = OPTYPE.CONTINUOUS.value
        target_pmml_dtype = DATATYPE.DOUBLE.value
    data_fields = list()
    # Categorical attributes get a DataField listing their allowed values.
    if categoric_values:
        for class_list, attr_for_class in zip(categoric_labels, categoric_feature_name):
            category_flds = pml.DataField(name=str(attr_for_class), optype=OPTYPE.CATEGORICAL.value,
                                          dataType=get_dtype(class_list[0]) if class_list else DATATYPE.STRING.value)
            if class_list:
                for values in class_list:
                    category_flds.add_Value(pml.Value(value=str(values)))
            data_fields.append(category_flds)
    # Remaining (non-categorical) features become continuous DataFields.
    attr_without_class_attr = [feat_name for feat_name in feature_names if feat_name not in categoric_feature_name]
    for feature_idx, feat_name in enumerate(attr_without_class_attr):
        data_fields.append(pml.DataField(name=str(feat_name),
                                         optype=features_pmml_optype[feature_idx],
                                         dataType=features_pmml_dtype[feature_idx]))
    # Target-less model types get no target DataField.
    if model.__class__.__name__ not in ['KMeans', 'IsolationForest', 'OneClassSVM']:
        class_node = pml.DataField(name=str(target_name), optype=target_pmml_optype,
                                   dataType=target_pmml_dtype)
        for class_value in target_attr_values:
            class_node.add_Value(pml.Value(value=str(class_value)))
        data_fields.append(class_node)
    data_dict = pml.DataDictionary(numberOfFields=len(data_fields), DataField=data_fields)
    return data_dict
def has_target(model):
    """
    Checks whether a given model has target or not

    Parameters
    ----------
    model :
        Scikit-learn's model object

    Returns
    -------
    Boolean value
    """
    # Anomaly/novelty detectors are the only exporter-supported models
    # trained without a target column.
    return model.__class__.__name__ not in ('OneClassSVM', 'IsolationForest')