Data Analyst
Build a multiple linear regression model to predict Profit from Administration spend, Marketing spend, R&D spend and State. Explain the trend and the correlation between the dependent and independent variables.
Click here to download the raw data and the Python code to follow along.
The solution guide is available too.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# --- Load and inspect the data ------------------------------------------
dataset = pd.read_csv('50_Startups.csv')
print(dataset.head())  # top 5 rows of the data set
# info() writes its report (dtypes, non-null counts) to stdout itself and
# returns None, so it must not be wrapped in print().
dataset.info()
print(dataset.describe())  # summary statistics

# Mean profit per state, highest first.
Profitperstate = (
    dataset.groupby('State')['Profit']
    .mean()
    .sort_values(ascending=False)
)
print(Profitperstate.head())

# --- Features / target ---------------------------------------------------
X = dataset.iloc[:, :-1]  # every column except the last one
y = dataset.iloc[:, -1]   # the last column (the target)

# --- Correlation heatmap -------------------------------------------------
# Correlate the numeric columns only: the string-valued 'State' column
# cannot be correlated and makes DataFrame.corr() raise on recent pandas.
# The full data set (not just X) is used so that the relationship between
# the target and each numeric predictor is visible in the plot.
plt.figure(figsize=(8, 8))
numeric_corr = dataset.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap="coolwarm")

# --- Encode the categorical column --------------------------------------
# One-hot encode 'State'; drop_first=True keeps k-1 indicator columns for
# k states so the indicators are not perfectly collinear.
states = pd.get_dummies(X['State'], drop_first=True)
X = pd.concat([X.drop('State', axis=1), states], axis=1)

# --- Train / evaluate ----------------------------------------------------
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# R^2 (coefficient of determination) on the held-out rows.
score = r2_score(y_test, y_pred)
print(score)