Regression code with a very high train score and low test score: how can I enhance my code?
I am performing regression analysis on some data. I keep getting a very high training score and a low test score. My code is below; what can I do to enhance it? Thank you in advance.
# coding: utf-8
# In[1]:
#Importing modules
import sys
import math
import itertools
import numpy as np
import pandas as pd
from numpy import genfromtxt
from matplotlib import style
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
from IPython.display import display  # makes the display() calls below explicit
# In[2]:
#Importing data
df = np.genfromtxt('/Users/Studies/Machine_learning/reactivity/main_us.csv', delimiter=',')
#To skip the header row, add skip_header=1
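#A minimal sketch (assuming the first row of main_us.csv is a header row):
#np.genfromtxt skips leading rows with its skip_header argument, e.g.
#df = np.genfromtxt('/Users/Studies/Machine_learning/reactivity/main_us.csv', delimiter=',', skip_header=1)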
# In[3]:
X = df[0:,1:306]
y = df[0:,0]
# In[4]:
print(X.shape)
print(y.shape)
display(X)
display(y)
print(y)
# In[5]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=4)
# In[6]:
#Apply StandardScaler for feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
print(len(X_test), len(y_test))
# In[7]:
#Applying PCA for dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
#Checking shape after PCA
print("Checking shape after PCA")
print(X_train.shape)
print(X_test.shape)
#Variance/Values
print("Explained_variance_ratio")
print(pca.explained_variance_ratio_)
print("Singular_values")
print(pca.singular_values_)
#Plotting
print ("Graph")
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolor='none', alpha=0.5, cmap=plt.cm.get_cmap('rainbow', 6))
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.colorbar();
print('You are looking at high-dimensional data explained by 2 components')
print('Even though these components hold some information, it is not enough to separate the groups apart')
#Checking shapes after PCA
print(X_train.shape)
print(y_train.shape)
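#A hedged sketch: PCA also accepts a float n_components in (0, 1) and keeps just
#enough components to explain that fraction of the variance (0.95 is an assumed threshold):
#pca = PCA(n_components=0.95)
#X_train = pca.fit_transform(X_train)
#X_test = pca.transform(X_test)
#print(np.cumsum(pca.explained_variance_ratio_))  # cumulative explained variance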
# In[8]:
alphas = 10**np.linspace(10, -2, 100) * 0.5  # wide alpha grid (note: redefined in In[10] below)
alphas
# In[9]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, Lasso
for Model in [Ridge, Lasso]:
    model = Model()
    print('%s: %s' % (Model.__name__,
                      cross_val_score(model, X, y).mean()))
# Out[9]:
Ridge: -1.3841312374053019
Lasso: -1.164517926682712
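#Note: cross_val_score above sees the raw X, while the train/test scores below use
#the scaled + PCA-transformed data. A sketch (an assumption, not the original code)
#that keeps the preprocessing inside each CV fold with a Pipeline:
#from sklearn.pipeline import make_pipeline
#pipe = make_pipeline(StandardScaler(), PCA(), Ridge())
#print(cross_val_score(pipe, X, y, cv=5).mean())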
# In[10]:
import numpy as np
from matplotlib import pyplot as plt
alphas = np.logspace(-3, -1, 30)
plt.figure(figsize=(5, 3))
for Model in [Lasso, Ridge]:
    scores = [cross_val_score(Model(alpha), X, y, cv=3).mean()
              for alpha in alphas]
    plt.plot(alphas, scores, label=Model.__name__)
plt.legend(loc='lower left')
plt.xlabel('alpha')
plt.ylabel('cross validation score')
plt.tight_layout()
plt.show()
# In[11]:
# alpha = 0.1
model = Ridge(alpha=0.1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))
# alpha = 0.01
model1 = Ridge(alpha=0.01)
model1.fit(X_train, y_train)  # was model.fit(...): the original refit `model`, which is why every alpha reported identical scores below
print(model1.score(X_train, y_train))
print(model1.score(X_test, y_test))
# alpha = 0.001
model2 = Ridge(alpha=0.001)
model2.fit(X_train, y_train)
print(model2.score(X_train, y_train))
print(model2.score(X_test, y_test))
# alpha = 0.0001
model3 = Ridge(alpha=0.0001)
model3.fit(X_train, y_train)
print(model3.score(X_train, y_train))
print(model3.score(X_test, y_test))
# Out[11]:
0.9999996833724945
-0.4120322763917558
0.9999996833724945
-0.4120322763917558
0.9999996833724945
-0.4120322763917558
0.9999996833724945
-0.4120322763917558
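#The four near-identical blocks above could be a single loop; a sketch:
#for a in [0.1, 0.01, 0.001, 0.0001]:
#    m = Ridge(alpha=a).fit(X_train, y_train)
#    print(a, m.score(X_train, y_train), m.score(X_test, y_test))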
# In[12]:
modelCV = RidgeCV(alphas=[0.1, 0.01, 0.001, 0.0001], store_cv_values=True)
modelCV.fit(X_train, y_train)
modelCV.alpha_  # gives 0.1
print(modelCV.score(X_train, y_train))  # giving 0.36898424479812919, the same score as ridge regression with alpha = 0.1
print(modelCV.score(X_test, y_test))
# Out[12]:
0.9999996833724951
-0.41203227638984496
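#A hedged follow-up: the alphas tried here are all small; the wider grid built in
#In[8] could be handed to RidgeCV instead (a sketch, not from the original run):
#modelCV = RidgeCV(alphas=10**np.linspace(10, -2, 100) * 0.5)
#modelCV.fit(X_train, y_train)
#print(modelCV.alpha_, modelCV.score(X_test, y_test))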
Tags: machine-learning, scikit-learn, regression, machine-learning-model, ridge-regression
asked 12 mins ago by tsumaranaina