Credit scoring using scorecardpy with XGBoost
I used XGBoost to score creditworthiness. At first I thought I could use predict_proba for scoring, but then I saw there is a module, scorecardpy, based on WOE (weight of evidence) for calculating credit scores. I tried to use it with my XGBoost model as in the examples, but my ROC AUC fell to 0.5 and I don't see what I am doing wrong. Thanks for your help.
import pandas as pd
import matplotlib.pyplot as plt
import scorecardpy as sc

data = pd.read_csv('data.csv')

# time-based train/test split
train_index = data['date'] < '2018-04-01'
test_index = data['date'] >= '2018-04-01'
data_final = data.drop('date', axis=1)
df_train = data_final[train_index]
df_test = data_final[test_index]
data_final_vars = data_final.columns.values.tolist()
y = ['label']
X = [i for i in data_final_vars if i not in y]

# woe binning ------
bins = sc.woebin(data_final, y="label")
sc.woebin_plot(bins)

# binning adjustment
# # adjust breaks interactively
# breaks_adj = sc.woebin_adj(data_final, "label", bins)
# # or specify breaks manually
breaks_adj = {
    'age': [26, 35, 40, 50, 60]
}
bins_adj = sc.woebin(data_final, y="label", breaks_list=breaks_adj)

# converting train and test into woe values
train_woe = sc.woebin_ply(df_train, bins_adj)
test_woe = sc.woebin_ply(df_test, bins_adj)

ytrain = train_woe.loc[:, 'label']
xtrain = train_woe.loc[:, train_woe.columns != 'label']
ytest = test_woe.loc[:, 'label']
xtest = test_woe.loc[:, test_woe.columns != 'label']
print("shape of xtrain: {}".format(xtrain.shape))
print("shape of xtest: {}".format(xtest.shape))

from xgboost import XGBClassifier
XGB = XGBClassifier(n_estimators=100, n_jobs=6)
# List the default parameters.
print(XGB.get_xgb_params())

# Train and evaluate
XGB.fit(xtrain, ytrain, eval_metric=['auc'],
        eval_set=[(xtrain, ytrain), (xtest, ytest)])

# ROC AUC on the test set
from sklearn.metrics import roc_auc_score
probs = XGB.predict_proba(xtest)
roc = roc_auc_score(y_true=ytest, y_score=probs[:, 1])
print("XGB roc score: {}".format(roc))

from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(ytest, probs[:, 1])
plt.figure()
plt.plot(fpr, tpr, label='XGBoost Classifier (area = %0.2f)' % roc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('XGB_ROC')

# 10-fold cross-validation on the training set
from sklearn import model_selection
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
results = model_selection.cross_val_score(XGB, xtrain, ytrain, cv=kfold, scoring='accuracy')
print("10-fold cross validation average accuracy: {}".format(results.mean()))

# score ------
card = sc.scorecard(bins_adj, XGB, xtrain.columns)
# credit score
train_score = sc.scorecard_ply(df_train, card, print_step=0)
test_score = sc.scorecard_ply(df_test, card, print_step=0)
# population stability index
sc.perf_psi(
    score={'train': train_score, 'test': test_score},
    label={'train': ytrain, 'test': ytest}
)
machine-learning python decision-trees xgboost scoring
asked Sep 26 '18 at 14:06 by Minila S, edited Sep 26 '18 at 14:55
1 Answer
It happened to me as well, although I was using a logistic regression model rather than XGBoost.
The problem is not which model you choose; rather, something seems to be wrong with the woebin_ply function. I haven't read the source code, but the WOE values I get don't match the WOE of the corresponding bin for each input value. (You can double-check your own results the same way.)
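For instance, a quick spot check on one variable. This is a minimal sketch: 'age' and the other names come from the question's code, and it assumes woebin_ply's convention of suffixing transformed columns with _woe and woebin's per-variable bins tables having 'bin' and 'woe' columns.

import pandas as pd

# WOE per bin for 'age', as reported by woebin
print(bins_adj['age'][['bin', 'woe']])

# Raw value next to the WOE that woebin_ply assigned to the same row
check = pd.DataFrame({
    'age': df_train['age'].values,
    'age_woe': train_woe['age_woe'].values,
})
print(check.head(10))  # each age should carry the WOE of the bin it falls into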
After manually matching each input value to its bin and the bin's WOE value, my scorecard model performs at a similar level to my benchmark models; a sketch of that manual mapping is below.
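Here is one way to do the manual matching for a numeric variable. It assumes, as in scorecardpy's output, that each bins table has a 'breaks' column holding the right edge of each left-closed bin (with 'inf' for the last one); missing and special values are not handled in this sketch.

import numpy as np

def manual_woe(values, bin_df):
    # Right edges of the left-closed bins, e.g. [26, 35, 40, 50, 60, inf]
    breaks = np.array([float(b) for b in bin_df['breaks']])
    woes = bin_df['woe'].values
    # For each value, find the first break strictly greater than it;
    # that index identifies the bin the value belongs to.
    idx = np.searchsorted(breaks, values, side='right')
    return woes[np.clip(idx, 0, len(woes) - 1)]

age_woe_manual = manual_woe(df_train['age'].values, bins_adj['age'])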
Hope this helps!
answered 4 hours ago by lsbillups (new contributor)