









Study with the several resources on Docsity
Earn points by helping other students or get them with a premium plan
Prepare for your exams
Study with the several resources on Docsity
Earn points to download
Earn points by helping other students or get them with a premium plan
This document is a comprehensive tutorial on applying several machine learning techniques to predict student exam scores. It covers the implementation of linear regression with gradient descent, linear regression with least squares, and polynomial regression in Python. The document guides the reader through data preprocessing, model initialization, cost function definition, gradient descent implementation, and model evaluation. It also includes insights into, and comparisons between, the different regression methods. It is likely to be useful for university students studying machine learning, data science, or applied statistics, as it provides hands-on experience with implementing and evaluating regression models on a real-world dataset.
Typology: Assignments
1 / 15
This page cannot be seen from the preview
Don't miss anything!










import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer import matplotlib.pyplot as plt
# Location of the CSV file (hosted on Google Drive) that holds the dataset.
url = 'https://drive.google.com/uc?id=1TwOizNpaHfITQK_kWbFVBTlHAw0e1bxW'
data = pd.read_csv(url)

# Quick inspection: first rows, column summary, and per-column missing counts.
print(data.head())
print(data.info())  # info() prints its own report and returns None, which is echoed as well
print(data.isna().sum())

# Separate the regression target from the predictor columns.
y = data['Exam_Scores']
X = data.drop(columns='Exam_Scores')

# Predictor columns grouped by kind.
numeric_features = ['Hours_Studied', 'Previous_Exams']
categorical_features = ['Parental_Education', 'Ethnicity']
numeric_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())]) categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='most_frequent')),
def compute_cost(X, y, weights, bias):
    """Return the halved mean squared error of the linear model ``X.dot(weights) + bias``.

    Args:
        X: Feature matrix with one row per sample; must expose ``shape`` and ``dot``.
        y: Target values, one per row of ``X``.
        weights: Coefficient vector applied to the columns of ``X``.
        bias: Intercept added to every prediction.

    Returns:
        ``sum((X.dot(weights) + bias - y) ** 2) / (2 * m)`` where ``m`` is the
        number of rows in ``X``.
    """
    n_samples = X.shape[0]
    residuals = X.dot(weights) + bias - y
    return (1 / (2 * n_samples)) * np.sum(residuals ** 2)


# Cost for the starting weights and bias, as a baseline.
initial_cost = compute_cost(X_train, y_train, weights, bias)
print("Initial cost:", initial_cost)
def gradient_descent(X, y, weights, bias, learning_rate, iterations):
    """Fit a linear model ``X.dot(weights) + bias`` by batch gradient descent.

    Each iteration computes the predictions for the whole of ``X``, derives the
    gradients of the halved mean squared error with respect to the weights and
    the bias, and moves both parameters one ``learning_rate``-sized step
    against their gradient.

    Args:
        X: Feature matrix with one row per sample.
        y: Target values, one per row of ``X``.
        weights: Starting coefficient vector. It is copied, so the caller's
            array is left untouched.
        bias: Starting intercept.
        learning_rate: Step size applied to both gradients.
        iterations: Number of update steps to perform.

    Returns:
        A ``(weights, bias, cost_history)`` tuple, where ``cost_history`` holds
        the cost measured after every update step.
    """
    m = X.shape[0]
    # Work on a private float copy: the in-place updates below would otherwise
    # modify the array the caller passed in (and fail for integer arrays).
    weights = np.array(weights, dtype=float)
    cost_history = []

    for i in range(iterations):
        predictions = X.dot(weights) + bias
        error = predictions - y

        # Gradients of the halved MSE with respect to the weights and the bias.
        dW = (1 / m) * X.T.dot(error)
        db = (1 / m) * np.sum(error)

        weights -= learning_rate * dW
        bias -= learning_rate * db

        cost = compute_cost(X, y, weights, bias)
        cost_history.append(cost)

        # Progress report every 100 iterations.
        if i % 100 == 0:
            print(f"Iteration {i}: Cost {cost}")

    return weights, bias, cost_history
# Hyperparameters for the gradient-descent run.
# The step size must be non-zero: with 0.0 the update `weights -= learning_rate * dW`
# never changes the parameters, so nothing is learned. 0.01 matches the value
# passed to LinearRegressionGD further down in this file.
learning_rate = 0.01
iterations = 1000

# Train, starting from the current weights and bias, and report the result.
weights, bias, cost_history = gradient_descent(
    X_train, y_train, weights, bias, learning_rate, iterations
)
print("Final weights:", weights)
print("Final bias:", bias)
# mean_squared_error is needed here, before the later import of it in this file.
from sklearn.metrics import mean_squared_error

# Prepend a column of ones so the first fitted coefficient acts as the bias.
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_val_b = np.c_[np.ones((X_val.shape[0], 1)), X_val]

# Solve the least-squares problem directly instead of inverting X^T X.
# The explicit inverse raises LinAlgError (or returns numerically meaningless
# values) when the columns are collinear, e.g. if a full one-hot encoding sits
# next to the column of ones. lstsq returns the minimum-norm solution in that
# case and the same solution as the normal equation otherwise.
weights_b, *_ = np.linalg.lstsq(X_train_b, y_train, rcond=None)
print("Weights (including bias):", weights_b)

# Predict on the validation split and score the fit.
y_pred_ls = X_val_b.dot(weights_b)

mse_ls = mean_squared_error(y_val, y_pred_ls)
print("MSE (Least Squares):", mse_ls)
from sklearn.linear_model import Ridge from sklearn.preprocessing import PolynomialFeatures
# Degree of the polynomial feature expansion.
degree = 2

# Expand both splits; the expansion is fitted on the training split only.
poly = PolynomialFeatures(degree=degree)
X_poly_train = poly.fit_transform(X_train)
X_poly_val = poly.transform(X_val)

# Ridge regression on the expanded features (fit() returns the estimator itself).
ridge_reg = Ridge(alpha=0.01).fit(X_poly_train, y_train)

# Validation predictions and their mean squared error.
y_pred_poly_ridge = ridge_reg.predict(X_poly_val)

mse_poly_ridge = mean_squared_error(y_val, y_pred_poly_ridge)
print(f"MSE (Polynomial Regression with Ridge, degree={degree}):", mse_poly_ridge)

# Predicted vs. actual scores, with the y = x line as the reference.
y_lo, y_hi = y_val.min(), y_val.max()
plt.scatter(y_val, y_pred_poly_ridge, c='orange', label='Predicted')
plt.plot([y_lo, y_hi], [y_lo, y_hi], 'r--', lw=2, label='Actual')
from sklearn.linear_model import Ridge from sklearn.preprocessing import PolynomialFeatures
# Degree of the polynomial feature expansion.
degree = 2

# Expand both splits; the expansion is fitted on the training split only.
poly = PolynomialFeatures(degree=degree)
X_poly_train = poly.fit_transform(X_train)
X_poly_val = poly.transform(X_val)

# You can adjust the alpha (regularization strength)
ridge_reg = Ridge(alpha=0.01).fit(X_poly_train, y_train)

# Validation predictions and their mean squared error.
y_pred_poly_ridge = ridge_reg.predict(X_poly_val)

mse_poly_ridge = mean_squared_error(y_val, y_pred_poly_ridge)
print(f"MSE (Polynomial Regression with Ridge, degree={degree}):", mse_poly_ridge)
# Side-by-side report of the validation MSE of the three models.
print("MSE Comparison:")
for report_line in (
    f"Gradient Descent: {mse_gd}",
    f"Least Squares: {mse_ls}",
    f"Polynomial Regression (Ridge, degree={degree}): {mse_poly_ridge}",
):
    print(report_line)
# One row of three predicted-vs-actual scatter plots, one per model.
plt.figure(figsize=(15, 5))

panels = (
    (y_pred, 'blue', 'Linear Regression (Gradient Descent)'),
    (y_pred_ls, 'green', 'Linear Regression (Least Squares)'),
    (y_pred_poly_ridge, 'orange', f'Polynomial Regression with Ridge (degree={degree})'),
)

for position, (predicted, colour, title) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.scatter(y_val, predicted, c=colour, label='Predicted')
    # Dashed y = x reference line spanning the range of the actual scores.
    plt.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'r--', lw=2, label='Actual')
    plt.xlabel('Actual Exam Scores')
    plt.ylabel('Predicted Exam Scores')
    plt.title(title)
    plt.legend()
from sklearn.metrics import mean_squared_error from sklearn.linear_model import LinearRegression from sklearn.pipeline import Pipeline, make_pipeline
class LinearRegressionGD:
    """Linear regression trained with batch gradient descent.

    The model is ``np.dot(X, theta) + bias``. ``fit`` minimises the halved mean
    squared error and records the cost of every iteration in ``cost_history``.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        """Store the training hyperparameters.

        Args:
            learning_rate: Step size applied to both gradients.
            n_iterations: Number of gradient-descent steps run by ``fit``.
        """
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

    def fit(self, X, y):
        """Learn ``theta`` and ``bias`` from the training data.

        Args:
            X: Two-dimensional feature matrix (rows are samples).
            y: Target values, one per row of ``X``.
        """
        self.m, self.n = X.shape
        # Training always starts from an all-zero model.
        self.theta = np.zeros(self.n)
        self.bias = 0.0
        self.cost_history = []

        for _ in range(self.n_iterations):
            y_pred = np.dot(X, self.theta) + self.bias
            residuals = y_pred - y

            # Cost of the parameters *before* this iteration's update.
            cost = (1 / (2 * self.m)) * np.sum(residuals ** 2)
            self.cost_history.append(cost)

            # Gradients of the halved MSE with respect to theta and bias.
            d_theta = (1 / self.m) * np.dot(X.T, residuals)
            d_bias = (1 / self.m) * np.sum(residuals)

            self.theta -= self.learning_rate * d_theta
            self.bias -= self.learning_rate * d_bias

    def predict(self, X):
        """Return the model's predictions for the rows of ``X``."""
        return np.dot(X, self.theta) + self.bias
# Train the gradient-descent model on the training split.
model_gd = LinearRegressionGD(learning_rate=0.01, n_iterations=1000)
model_gd.fit(X_train, y_train)

# Score it on the validation split.
y_pred_gd = model_gd.predict(X_val)

squared_errors_gd = (y_pred_gd - y_val) ** 2
mse_gd = np.mean(squared_errors_gd)
print("Mean Squared Error (Gradient Descent):", mse_gd)

# Learned parameters.
print("Coefficients (Gradient Descent):", model_gd.theta)
print("Intercept (Gradient Descent):", model_gd.bias)
# Fit scikit-learn's LinearRegression on the training split (fit() returns the estimator).
model_ls = LinearRegression().fit(X_train, y_train)

# Score it on the validation split.
y_pred_ls = model_ls.predict(X_val)

squared_errors_ls = (y_pred_ls - y_val) ** 2
mse_ls = np.mean(squared_errors_ls)
print("Mean Squared Error (Least Squares):", mse_ls)