from __future__ import absolute_import
import sys, os
BASE_DIR = os.path.dirname(os.path.dirname(__file__))
sys.path.append(BASE_DIR)
import PMML44 as pml
import json
import skl_to_pmml as sklToPmml
import pre_process as pp
from datetime import datetime
from enums import *
def xgboost_to_pmml(pipeline, col_names, target_name, pmml_f_name='from_xgboost.pmml',model_name=None,description=None):
    """
    Exports xgboost model object into pmml

    Parameters
    ----------
    pipeline :
        Contains an instance of Pipeline with preprocessing and final estimator
    col_names : List
        Contains list of feature/column names. An ``ndarray`` is converted
        to a plain list first.
    target_name : String
        Name of the target column.
    pmml_f_name : String
        Name of the pmml file. (Default='from_xgboost.pmml')
    model_name : string (optional)
        Name of the model
    description : string (optional)
        Description for the model

    Returns
    -------
    None
        Generates the PMML object and exports it to `pmml_f_name`

    Raises
    ------
    TypeError
        If the final estimator cannot be read from ``pipeline.steps``
        (for example when a bare estimator is passed instead of a pipeline).
    """
    try:
        model = pipeline.steps[-1][1]
    except (AttributeError, IndexError, KeyError, TypeError) as err:
        # Only lookup failures are translated; interpreter-level signals such
        # as KeyboardInterrupt are no longer swallowed by a bare ``except``.
        raise TypeError("Exporter expects pipeleine_instance and not an estimator_instance") from err

    # Checked by class name, so numpy does not have to be imported here.
    if col_names.__class__.__name__ == "ndarray":
        col_names = col_names.tolist()

    ppln_sans_predictor = pipeline.steps[:-1]
    trfm_dict_kwargs = dict()
    derived_col_names = col_names
    categoric_values = tuple()
    mining_imp_val = tuple()
    if ppln_sans_predictor:
        # Preprocessing steps exist: take the derived names, transformation
        # dictionary and imputation details from the preprocessing helper.
        pml_pp = pp.get_preprocess_val(ppln_sans_predictor, col_names, model)
        trfm_dict_kwargs['TransformationDictionary'] = pml_pp['trfm_dict']
        derived_col_names = pml_pp['derived_col_names']
        col_names = pml_pp['preprocessed_col_names']
        categoric_values = pml_pp['categorical_feat_values']
        mining_imp_val = pml_pp['mining_imp_values']

    PMML_kwargs = get_PMML_kwargs(model,
                                  derived_col_names,
                                  col_names,
                                  target_name,
                                  mining_imp_val,
                                  categoric_values,
                                  model_name)
    pmml = pml.PMML(
        version=PMML_SCHEMA.VERSION.value,
        Header=sklToPmml.get_header(description),
        DataDictionary=sklToPmml.get_data_dictionary(model, col_names, target_name, categoric_values),
        **trfm_dict_kwargs,
        **PMML_kwargs
    )
    # The context manager guarantees the file is flushed and closed even if
    # the export raises part-way through.
    with open(pmml_f_name, "w") as outfile:
        pmml.export(outfile=outfile, level=0)
def get_PMML_kwargs(model, derived_col_names, col_names, target_name, mining_imp_val,categoric_values,model_name):
    """
    Build the keyword arguments holding the model element of the PMML.

    Parameters
    ----------
    model :
        Contains XGBoost model object.
    derived_col_names : List
        Contains column names after preprocessing
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the target column .
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    algo_kwargs : dict
        A single-entry dictionary mapping ``'MiningModel'`` to the result of
        ``get_ensemble_models``.
    """
    ensemble = get_ensemble_models(
        model,
        derived_col_names,
        col_names,
        target_name,
        mining_imp_val,
        categoric_values,
        model_name,
    )
    return {'MiningModel': ensemble}
def get_ensemble_models(model, derived_col_names, col_names, target_name, mining_imp_val,categoric_values,model_name):
    """
    Build the top-level MiningModel element for the XGBoost model.

    Parameters
    ----------
    model :
        Contains Xgboost model object.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value.
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model; "XGBoostModel" is used when it is empty or None.

    Returns
    -------
    mining_models : list
        A one-element list holding Nyoka's MiningModel object
    """
    extra_kwargs = sklToPmml.get_model_kwargs(model, col_names, target_name, mining_imp_val, categoric_values)

    # Only the regressor gets a Targets element.
    is_regressor = 'XGBRegressor' in str(model.__class__)
    if is_regressor:
        extra_kwargs['Targets'] = sklToPmml.get_targets(model, target_name)

    resolved_name = model_name or "XGBoostModel"
    outer_segmentation = get_outer_segmentation(
        model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name)

    return [
        pml.MiningModel(
            modelName=resolved_name,
            Segmentation=outer_segmentation,
            **extra_kwargs
        )
    ]
def get_outer_segmentation(model, derived_col_names, col_names, target_name, mining_imp_val,categoric_values,model_name):
    """
    Build the outermost Segmentation element of the model.

    For a regressor the value returned by ``get_segments`` is passed through
    unchanged; for any other model that value is wrapped in a new
    Segmentation whose combining method comes from
    ``get_multiple_model_method``.

    Parameters
    ----------
    model :
        Contains Xgboost model object.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    segmentation :
        Returns Nyoka's Segmentation object
    """
    if 'XGBRegressor' in str(model.__class__):
        return get_segments(
            model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name)

    combine_method = get_multiple_model_method(model)
    inner_segments = get_segments(
        model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name)
    return pml.Segmentation(multipleModelMethod=combine_method, Segment=inner_segments)
def get_segments(model, derived_col_names, col_names, target_name, mining_imp_val,categoric_values,model_name):
    """
    Dispatch to the classifier or regressor segment builder.

    The choice is made by looking for 'XGBClassifier' / 'XGBRegressor' in
    the string form of the model's class.

    Parameters
    ----------
    model :
        Contains Xgboost model object.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    segment :
        Result of the matching builder, or None when the model is neither
        an XGBClassifier nor an XGBRegressor.
    """
    class_repr = str(model.__class__)
    if 'XGBClassifier' in class_repr:
        return get_segments_for_xgbc(
            model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values, model_name)
    if 'XGBRegressor' in class_repr:
        return get_segments_for_xgbr(
            model, derived_col_names, col_names, target_name, mining_imp_val, categoric_values)
    return None
def get_segments_for_xgbr(model, derived_col_names, feature_names, target_name, mining_imp_val,categorical_values):
    """
    Build the Segmentation element for an XGBoost regressor.

    Parameters
    ----------
    model :
        Contains Xgboost model object.
    derived_col_names : List
        Contains column names after preprocessing.
    feature_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column. Not used here; kept for a uniform signature.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value.
        Not used here; kept for a uniform signature.
    categorical_values : tuple
        Contains Categorical attribute names and its values.
        Not used here; kept for a uniform signature.

    Returns
    -------
    segmentation :
        Nyoka's Segmentation object that sums one tree segment per estimator
    """
    # Dump the booster once; calling get_dump() inside the loop re-serialised
    # every tree for each estimator.
    tree_dumps = model._Booster.get_dump(dump_format='json')
    # Indexing (rather than slicing) keeps the IndexError when the booster
    # holds fewer trees than n_estimators.
    trees = [json.loads(tree_dumps[i]) for i in range(model.n_estimators)]
    segmentation = pml.Segmentation(
        multipleModelMethod=MULTIPLE_MODEL_METHOD.SUM.value,
        Segment=generate_Segments_Equal_To_Estimators(trees, derived_col_names, feature_names))
    return segmentation
def mining_Field_For_First_Segment(feature_names):
    """
    Build the Mining Schema of the First Segment.

    Parameters
    ----------
    feature_names : List
        Contains list of feature/column names.

    Returns
    -------
    mining_schema_for_1st_segment :
        Nyoka's MiningSchema object with one MiningField per feature name,
        in the given order
    """
    fields = [pml.MiningField(name=feature) for feature in feature_names]
    return pml.MiningSchema(MiningField=fields)
def replace_name_with_derivedColumnNames(original_name, derived_col_names):
    """
    Map a positional feature name from the tree dump to its column name.

    A name of the form ``f<index>`` (or a bare ``<index>``) is treated as a
    position in `derived_col_names`; any other name is returned unchanged.

    Parameters
    ----------
    original_name : String
        The name of the node retrieve from model
    derived_col_names : List
        The name of the derived attributes.

    Returns
    -------
    col_name :
        Returns the derived column name/original column name.

    Raises
    ------
    IndexError
        If the parsed index is outside `derived_col_names`.
    """
    # Strip only ONE leading 'f'. Removing every 'f' turned genuine names such
    # as '1f' or 'f1f' into digits and silently mapped them to a wrong column.
    if original_name.startswith('f'):
        index_text = original_name[1:]
    else:
        index_text = original_name
    # isdecimal() accepts only characters that int() can parse; isdigit()
    # also accepts e.g. superscript digits, which made int() raise ValueError.
    if index_text.isdecimal():
        return derived_col_names[int(index_text)]
    return original_name
def create_node(obj, main_node,derived_col_names):
    """
    Recursively populate `main_node` from one dumped tree node.

    A dict without a 'split' key is a leaf: its 'leaf' value becomes the
    score of `main_node`. Otherwise two child nodes are added, in order:
    ``children[0]`` guarded by a "less than" predicate and ``children[1]``
    guarded by a "greater or equal" predicate, both on the split field and
    the split condition formatted with 16 decimal places.

    Parameters
    ----------
    obj : Json
        Contains nodes in json format.
    main_node :
        Contains node build with Nyoka class.
    derived_col_names : List
        Contains column names after preprocessing.
    """
    if 'split' not in obj:
        main_node.set_score(obj['leaf'])
        return

    branches = (
        (0, SIMPLE_PREDICATE_OPERATOR.LESS_THAN),
        (1, SIMPLE_PREDICATE_OPERATOR.GREATER_OR_EQUAL),
    )
    for child_index, comparison in branches:
        child = pml.Node()
        predicate = pml.SimplePredicate(
            field=replace_name_with_derivedColumnNames(obj['split'], derived_col_names),
            operator=comparison.value,
            value="{:.16f}".format(obj['split_condition']))
        child.set_SimplePredicate(predicate)
        # Fill the whole subtree before attaching it to the parent.
        create_node(obj['children'][child_index], child, derived_col_names)
        main_node.add_Node(child)
def generate_Segments_Equal_To_Estimators(val, derived_col_names, col_names):
    """
    Build one tree Segment per entry of `val`.

    Segment ids start at 1 and follow the order of `val`. Each segment holds
    a regression TreeModel whose root node is filled by ``create_node`` and
    whose MiningSchema lists every name in `col_names`.

    Parameters
    ----------
    val : List
        Contains a list of well structured node for binary classification/inner segments for multi-class classification
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.

    Returns
    -------
    segments_equal_to_estimators : list
        List of Nyoka's Segment objects (empty when `val` is empty)
    """
    tree_segments = []
    for segment_id, tree in enumerate(val, start=1):
        root = pml.Node(True_=pml.True_())
        create_node(tree, root, derived_col_names)
        # A fresh field list per tree, so segments never share objects.
        schema_fields = [pml.MiningField(name=column) for column in col_names]
        tree_model = pml.TreeModel(
            functionName=MINING_FUNCTION.REGRESSION.value,
            modelName="DecisionTreeModel",
            missingValueStrategy="none",
            noTrueChildStrategy="returnLastPrediction",
            splitCharacteristic=TREE_SPLIT_CHARACTERISTIC.MULTI.value,
            Node=root,
            MiningSchema=pml.MiningSchema(MiningField=schema_fields))
        tree_segments.append(pml.Segment(id=segment_id, True_=pml.True_(), TreeModel=tree_model))
    return tree_segments
def add_segmentation(model,segments_equal_to_estimators,mining_schema_for_1st_segment,out,id):
    """
    Wrap a group of tree segments into one Segment holding a MiningModel.

    The trees are combined with the "sum" method inside a regression
    MiningModel. The resulting Segment gets `id` as its id when the model
    has exactly two classes, and ``id + 1`` otherwise.

    Parameters
    ----------
    model :
        Contains Xgboost model object.
    segments_equal_to_estimators : List
        Contains List Segements equals to the number of the estimators of the model.
    mining_schema_for_1st_segment :
        Contains Mining Schema for the First Segment
    out :
        Contains the Output element
    id : Integer
        Index of the Segements

    Returns
    -------
    segment :
        Returns Nyoka's Segment object
    """
    summed_trees = pml.Segmentation(
        multipleModelMethod=MULTIPLE_MODEL_METHOD.SUM.value,
        Segment=segments_equal_to_estimators)
    wrapped_model = pml.MiningModel(
        functionName=MINING_FUNCTION.REGRESSION.value,
        modelName="MiningModel",
        MiningSchema=mining_schema_for_1st_segment,
        Output=out,
        Segmentation=summed_trees)
    segment_id = id if model.n_classes_ == 2 else id + 1
    return pml.Segment(True_=pml.True_(), id=segment_id, MiningModel=wrapped_model)
def get_segments_for_xgbc(model, derived_col_names, feature_names, target_name, mining_imp_val,categoric_values,model_name):
    """
    It returns all the segments of the Xgboost classifier.

    Binary case (``n_classes_ == 2``): one segment summing all trees into the
    output field 'xgbValue', followed by a regression segment using logistic
    normalization. Otherwise: one summing segment per class, each writing
    'xgbValue(<class index>)', followed by a regression segment using
    softmax normalization.

    Parameters
    ----------
    model :
        Contains Xgboost model object.
    derived_col_names : List
        Contains column names after preprocessing.
    feature_names: List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values
    model_name : string
        Name of the model

    Returns
    -------
    segments : list
        Returns a list of Nyoka's Segment objects
    """
    segments = list()
    # Dump the booster once; calling get_dump() inside the loops re-serialised
    # every tree for each index.
    tree_dumps = model._Booster.get_dump(dump_format='json')
    if model.n_classes_ == 2:
        trees = [json.loads(tree_dumps[i]) for i in range(model.n_estimators)]
        mining_schema_for_1st_segment = mining_Field_For_First_Segment(feature_names)
        outputField = [pml.OutputField(name="xgbValue", optype=OPTYPE.CONTINUOUS.value,
                                       dataType=DATATYPE.FLOAT.value,
                                       feature=RESULT_FEATURE.PREDICTED_VALUE.value,
                                       isFinalResult="true")]
        out = pml.Output(OutputField=outputField)
        oField = ['xgbValue']
        segments_equal_to_estimators = generate_Segments_Equal_To_Estimators(trees, derived_col_names,
                                                                             feature_names)
        first_segment = add_segmentation(model, segments_equal_to_estimators,
                                         mining_schema_for_1st_segment, out, 1)
        # The regression model consumes the summed tree output as its only field.
        reg_model = sklToPmml.get_regrs_models(model, oField, oField, target_name,
                                               mining_imp_val, categoric_values, model_name)[0]
        reg_model.normalizationMethod = REGRESSION_NORMALIZATION_METHOD.LOGISTIC.value
        last_segment = pml.Segment(True_=pml.True_(), id=2,
                                   RegressionModel=reg_model)
        segments.append(first_segment)
        segments.append(last_segment)
    else:
        n_classes = model.n_classes_
        trees = [json.loads(tree_dumps[i]) for i in range(model.n_estimators * n_classes)]
        oField = list()
        for index in range(0, n_classes):
            # Trees are taken with a stride of n_classes: tree k goes to
            # class k % n_classes.
            inner_segment = trees[index::n_classes]
            mining_schema_for_1st_segment = mining_Field_For_First_Segment(feature_names)
            output_name = 'xgbValue(' + str(index) + ')'
            outputField = [pml.OutputField(name=output_name, optype=OPTYPE.CONTINUOUS.value,
                                           feature=RESULT_FEATURE.PREDICTED_VALUE.value,
                                           dataType=DATATYPE.FLOAT.value,
                                           isFinalResult="true")]
            out = pml.Output(OutputField=outputField)
            oField.append(output_name)
            segments_equal_to_estimators = generate_Segments_Equal_To_Estimators(inner_segment, derived_col_names,
                                                                                 feature_names)
            # add_segmentation assigns id index + 1 in the non-binary case.
            segments_equal_to_class = add_segmentation(model, segments_equal_to_estimators,
                                                       mining_schema_for_1st_segment, out, index)
            segments.append(segments_equal_to_class)
        reg_model = sklToPmml.get_regrs_models(model, oField, oField, target_name,
                                               mining_imp_val, categoric_values, model_name)[0]
        reg_model.normalizationMethod = REGRESSION_NORMALIZATION_METHOD.SOFTMAX.value
        last_segment = pml.Segment(True_=pml.True_(), id=n_classes + 1,
                                   RegressionModel=reg_model)
        segments.append(last_segment)
    return segments
def get_multiple_model_method(model):
    """
    Pick the multiple model method for the outer MiningModel.

    Parameters
    ----------
    model :
        Contains Xgboost model object

    Returns
    -------
    The "model chain" method value when 'XGBClassifier' appears in the
    string form of the model's class, otherwise the "sum" method value.
    """
    is_classifier = 'XGBClassifier' in str(model.__class__)
    chosen = MULTIPLE_MODEL_METHOD.MODEL_CHAIN if is_classifier else MULTIPLE_MODEL_METHOD.SUM
    return chosen.value