Regression code with a very high train score and low test score, how can I enhance my code?


I am performing regression analysis on some data. I keep getting a very high training score and a low test score. My code is below; what can I do to improve it? Thank you in advance.



# coding: utf-8

# In[1]:

#Importing modules
import sys
import math
import itertools
import numpy as np
import pandas as pd
from numpy import genfromtxt
from matplotlib import style
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn


# In[2]:


#Importing data
df = np.genfromtxt('/Users/Studies/Machine_learning/reactivity/main_us.csv', delimiter=',')
#To skip a header row, add skip_header=1
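For reference, a minimal sketch of skipping a header line with genfromtxt (assuming the file has exactly one header row; skip_header is the genfromtxt parameter):

df = np.genfromtxt('/Users/Studies/Machine_learning/reactivity/main_us.csv',
                   delimiter=',', skip_header=1)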


# In[3]:


X = df[0:,1:306]
y = df[0:,0]


# In[4]:


print(X.shape)
print(y.shape)
display(X)  # display() is available as a builtin in Jupyter notebooks
display(y)
print(y)


# In[5]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=4)


# In[6]:


#Apply StandardScaler for feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(len(X_test), len(y_test))


# In[7]:


#Applying PCA for dimensionality reduction

from sklearn.decomposition import PCA
pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

#Checking shape after PCA
print("Checking shape after PCA")
print(X_train.shape)
print(X_test.shape)


#Variance/Values
print("Explained_variance_ratio")
print(pca.explained_variance_ratio_)
print("Singular_values")
print(pca.singular_values_)


#Plotting the first two principal components
print("Graph")
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolor='none', alpha=0.5, cmap=plt.cm.get_cmap('rainbow', 6))
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.colorbar();

print('You are looking at high-dimensional data projected onto 2 components')
print('Even though these components hold some information, it is not enough to separate the data')


#Checking shapes again
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
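Since PCA() is called with no n_components, it keeps every component (up to min(n_samples, n_features)), so nothing is actually reduced. A minimal sketch of truncating to the components that explain most of the variance (the 0.95 threshold is an illustrative assumption, not from the original code):

from sklearn.decomposition import PCA

# A float n_components in (0, 1) keeps just enough components
# to explain that fraction of the variance.
pca = PCA(n_components=0.95)
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)
print(X_train_reduced.shape)  # fewer columns than the 305 original features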


# In[8]:


alphas = 10**np.linspace(10, -2, 100) * 0.5  # candidate regularization strengths (redefined with np.logspace below)
alphas


# In[9]:


from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, Lasso

for Model in [Ridge, Lasso]:
    model = Model()
    print('%s: %s' % (Model.__name__,
                      cross_val_score(model, X, y).mean()))

# Out[9]:

Ridge: -1.3841312374053019
Lasso: -1.164517926682712
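Note that these scores are computed on the raw X and y, while the later train/test evaluation uses scaled, PCA-transformed data, so the two are not directly comparable. A minimal sketch of cross-validating the whole preprocessing-plus-model chain instead (the step names here are illustrative):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# The scaler and PCA are re-fit inside every fold, so no fold's
# held-out data leaks into the preprocessing statistics.
pipe = Pipeline([('scale', StandardScaler()),
                 ('pca', PCA()),
                 ('ridge', Ridge())])
print(cross_val_score(pipe, X, y, cv=5).mean())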

# In[10]:


import numpy as np
from matplotlib import pyplot as plt

alphas = np.logspace(-3, -1, 30)

plt.figure(figsize=(5, 3))

for Model in [Lasso, Ridge]:
    scores = [cross_val_score(Model(alpha), X, y, cv=3).mean()
              for alpha in alphas]
    plt.plot(alphas, scores, label=Model.__name__)

plt.legend(loc='lower left')
plt.xlabel('alpha')
plt.ylabel('cross validation score')
plt.tight_layout()
plt.show()


# In[11]:


# alpha = 0.1
model = Ridge(alpha=0.1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

# alpha = 0.01
# (note: this and the blocks below fit and score `model`, not model1/model2/model3,
#  so all four print the alpha = 0.1 results, see Out[11])
model1 = Ridge(alpha=0.01)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

# alpha = 0.001
model2 = Ridge(alpha=0.001)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

# alpha = 0.0001
model3 = Ridge(alpha=0.0001)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

# Out[11]:

0.9999996833724945
-0.4120322763917558
0.9999996833724945
-0.4120322763917558
0.9999996833724945
-0.4120322763917558
0.9999996833724945
-0.4120322763917558
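The four identical score pairs above follow from the refit issue noted in the comments: every block refits and scores `model` (alpha = 0.1). A minimal sketch of what the cell was presumably meant to do, fitting a fresh estimator for each alpha:

from sklearn.linear_model import Ridge

for alpha in [0.1, 0.01, 0.001, 0.0001]:
    ridge = Ridge(alpha=alpha)  # new model per regularization strength
    ridge.fit(X_train, y_train)
    print(alpha, ridge.score(X_train, y_train), ridge.score(X_test, y_test))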


# In[12]:


modelCV = RidgeCV(alphas=[0.1, 0.01, 0.001, 0.0001], store_cv_values=True)
modelCV.fit(X_train, y_train)
modelCV.alpha_  # giving 0.1
print(modelCV.score(X_train, y_train))  # giving 0.36898424479812919, the same score as ridge regression with alpha = 0.1
print(modelCV.score(X_test, y_test))

# Out[12]:

0.9999996833724951
-0.41203227638984496
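With a train R^2 of ~1.0 and a negative test R^2, the model is badly overfit, and all four candidate alphas are too small to change that. A sketch, assuming a log-spaced grid is reasonable here, that lets RidgeCV consider much stronger penalties:

import numpy as np
from sklearn.linear_model import RidgeCV

# 13 alphas from 1e-4 to 1e8 on a log scale; the large alphas shrink
# the coefficients hard enough to actually regularize.
alphas = np.logspace(-4, 8, 13)
modelCV = RidgeCV(alphas=alphas)
modelCV.fit(X_train, y_train)
print(modelCV.alpha_)
print(modelCV.score(X_train, y_train), modelCV.score(X_test, y_test))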









      machine-learning scikit-learn regression machine-learning-model ridge-regression






asked 12 mins ago by tsumaranaina




















