Credit scoring using scorecardpy with XGBoost












I used XGBoost for scoring creditworthiness. At first I thought I could simply use predict_proba for scoring, but then I saw that there is a module, scorecardpy, which uses WOE (weight of evidence) binning to compute credit scores. I tried to use it with my XGBoost model, following an example, but my ROC AUC fell to 0.5 and I don't see what I am doing wrong. Thanks for your help.
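
(By "use predict_proba for scoring" I mean turning the predicted default probability into points with the usual points-to-double-the-odds scaling, something like this minimal sketch; the points0 / odds0 / pdo values are only illustrative, not taken from my data.)

import numpy as np

def proba_to_score(p_bad, points0=600, odds0=19, pdo=20):
    """Turn a predicted probability of default into scorecard points with the
    usual log-odds scaling: points0 points at good:bad odds of odds0, and pdo
    extra points for every doubling of the odds (values here are illustrative)."""
    factor = pdo / np.log(2)
    offset = points0 - factor * np.log(odds0)
    odds_good = (1 - p_bad) / p_bad
    return offset + factor * np.log(odds_good)

# e.g., with the model fitted further down:
# scores = proba_to_score(XGB.predict_proba(xtest)[:, 1])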



import pandas as pd
import matplotlib.pyplot as plt
import scorecardpy as sc

data = pd.read_csv('data.csv')

# time-based train/test split on the 'date' column
train_index = data['date'] < '2018-04-01'
test_index = data['date'] >= '2018-04-01'

data_final = data.drop('date', axis=1)

df_train = data_final[train_index]
df_test = data_final[test_index]

data_final_vars = data_final.columns.values.tolist()
y = ['label']
X = [i for i in data_final_vars if i not in y]


# woe binning ------
bins = sc.woebin(data_final, y="label")
sc.woebin_plot(bins)

# binning adjustment
# # adjust breaks interactively
# breaks_adj = sc.woebin_adj(data_final, "label", bins)
# # or specify breaks manually
breaks_adj = {
    'age': [26, 35, 40, 50, 60]
}
bins_adj = sc.woebin(data_final, y="label", breaks_list=breaks_adj)

# converting train and test into woe values
train_woe = sc.woebin_ply(df_train, bins_adj)
test_woe = sc.woebin_ply(df_test, bins_adj)


ytrain = train_woe.loc[:, 'label']
xtrain = train_woe.loc[:, train_woe.columns != 'label']
ytest = test_woe.loc[:, 'label']
xtest = test_woe.loc[:, test_woe.columns != 'label']

print("shape of xtrain: {}".format(xtrain.shape))
print("shape of xtest: {}".format(xtest.shape))

from xgboost import XGBClassifier

XGB = XGBClassifier(n_estimators=100, n_jobs=6, verbosity=1)
# List the default parameters.
print(XGB.get_xgb_params())

# Train and evaluate on both the train and the test set
XGB.fit(xtrain, ytrain, eval_metric=['rmse'],
        eval_set=[(xtrain, ytrain), (xtest, ytest)])


# # Classifier

from sklearn.metrics import roc_auc_score

probs = XGB.predict_proba(xtest)
roc = roc_auc_score(y_true=ytest, y_score=probs[:, 1])
print("XGBoost ROC AUC score: {}".format(roc))


from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(ytest, probs[:, 1])
plt.figure()
plt.plot(fpr, tpr, label='XGBoost Classifier (area = %0.2f)' % roc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('XGB_ROC')


from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
modelCV = XGB
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, xtrain, ytrain, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: {}".format(results.mean()))


# score ------
card = sc.scorecard(bins_adj, XGB, xtrain.columns)
# credit score
train_score = sc.scorecard_ply(df_train, card, print_step=0)
test_score = sc.scorecard_ply(df_test, card, print_step=0)

# psi
sc.perf_psi(
    score={'train': train_score, 'test': test_score},
    label={'train': df_train['label'], 'test': df_test['label']}
)









      machine-learning python decision-trees xgboost scoring






edited Sep 26 '18 at 14:55
Minila S

asked Sep 26 '18 at 14:06
Minila S


          1 Answer

It happened to me as well, although I used a logistic regression model rather than XGBoost.

The problem is not which model you choose; rather, there seems to be something wrong with the woebin_ply function. I didn't read the source code, but the WOE values I was getting did not match the values for the corresponding bins/input values. (You can double-check your own results the same way.)

After manually matching each input value to its bin and the corresponding WOE value, my scorecard model performed at a similar level to my benchmark models.
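
As an illustration, here is a minimal sketch of that manual check for one numeric variable. It assumes the woebin tables carry 'breaks', 'woe' and 'is_special_values' columns with right-open intervals, as they did in my version of scorecardpy, and that the variable has no missing values; treat it as a sketch rather than the package's exact API.

import numpy as np
import pandas as pd

def manual_woe(values, bin_df):
    """Map raw numeric values to WOE directly from a woebin table so the
    result can be compared with what woebin_ply returns (no missing or
    special values assumed)."""
    num_bins = bin_df[~bin_df['is_special_values'].astype(bool)]
    # 'breaks' holds each bin's upper edge; the last one is 'inf'
    edges = [-np.inf] + num_bins['breaks'].astype(float).tolist()
    woes = num_bins['woe'].to_numpy()
    # right-open intervals [a, b), matching the "[a,b)" bin labels
    idx = np.asarray(pd.cut(values, bins=edges, right=False, labels=False), dtype=int)
    return woes[idx]

# compare against woebin_ply's output for, e.g., the 'age' variable
manual = manual_woe(df_train['age'], bins_adj['age'])
print(np.abs(manual - train_woe['age_woe'].to_numpy()).max())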



Hope this helps!






                answered 4 hours ago









lsbillups
