In [13]:
# Load the necessary libraries
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Load the dataset and Select only the specified columns
df = pd.read_csv('vehicles2.csv')
columns_to_keep = [
'price', 'year', 'manufacturer', 'condition', 'cylinders',
'fuel', 'odometer', 'title_status', 'transmission',
'drive', 'type', 'paint_color', 'description'
]
df = df[columns_to_keep]
print('Data before processing')
print(df.info())
# Handle missing values
df['year'].fillna(0, inplace=True)
df['manufacturer'].fillna('Unknown', inplace=True)
df['condition'].fillna('Unknown', inplace=True)
df['cylinders'].fillna('Unknown', inplace=True)
df['fuel'].fillna('Unknown', inplace=True)
df['odometer'].fillna(0, inplace=True)
df['title_status'].fillna('Unknown', inplace=True)
df['transmission'].fillna('Unknown', inplace=True)
df['drive'].fillna('', inplace=True)
df['type'].fillna('Unknown', inplace=True)
df['paint_color'].fillna('Unknown', inplace=True)
df['description'].fillna('Unknown', inplace=True)
print('Data after processing')
print(df.info())
#Convert type to numeric value
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])
# Convert categorical variables to numerical ones
categorical_columns = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_columns)
# Split the data into features and target
X = df.drop('type', axis=1)
y = df['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Data before processing <class 'pandas.core.frame.DataFrame'> RangeIndex: 10000 entries, 0 to 9999 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 10000 non-null int64 1 year 9887 non-null float64 2 manufacturer 9517 non-null object 3 condition 5347 non-null object 4 cylinders 6713 non-null object 5 fuel 9878 non-null object 6 odometer 9942 non-null float64 7 title_status 9753 non-null object 8 transmission 9917 non-null object 9 drive 7101 non-null object 10 type 8149 non-null object 11 paint_color 7984 non-null object 12 description 9973 non-null object dtypes: float64(2), int64(1), object(10) memory usage: 1015.8+ KB None Data after processing <class 'pandas.core.frame.DataFrame'> RangeIndex: 10000 entries, 0 to 9999 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 10000 non-null int64 1 year 10000 non-null float64 2 manufacturer 10000 non-null object 3 condition 10000 non-null object 4 cylinders 10000 non-null object 5 fuel 10000 non-null object 6 odometer 10000 non-null float64 7 title_status 10000 non-null object 8 transmission 10000 non-null object 9 drive 10000 non-null object 10 type 10000 non-null object 11 paint_color 10000 non-null object 12 description 10000 non-null object dtypes: float64(2), int64(1), object(10) memory usage: 1015.8+ KB None
In [ ]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Train the Logistic Regression model
logreg = LogisticRegression(solver='liblinear', max_iter=2000)
logreg.fit(X_train, y_train)
# Evaluate the model
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print(f'Logistic Regression - Accuracy: {accuracy:.2f}')
print(f'Logistic Regression - Precision: {precision:.2f}')
print(f'Logistic Regression - Recall: {recall:.2f}')
print(f'Logistic Regression - F1 Score: {f1:.2f}')
Out[ ]:
LogisticRegression(max_iter=2000, solver='liblinear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=2000, solver='liblinear')
Logistic Regression - Accuracy: 0.27 Logistic Regression - Precision: 0.29 Logistic Regression - Recall: 0.27 Logistic Regression - F1 Score: 0.17
In [15]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
# Train the Decision Tree model
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
# Evaluate the model
y_pred = dtree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print(f'Decision Tree - Accuracy: {accuracy:.2f}')
print(f'Decision Tree - Precision: {precision:.2f}')
print(f'Decision Tree - Recall: {recall:.2f}')
print(f'Decision Tree - F1 Score: {f1:.2f}')
Out[15]:
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
Decision Tree - Accuracy: 0.73 Decision Tree - Precision: 0.73 Decision Tree - Recall: 0.73 Decision Tree - F1 Score: 0.73
In [16]:
# Import necessary libraries
from sklearn.naive_bayes import GaussianNB
# Train the Naive Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)
# Evaluate the model
y_pred = nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print(f'Naive Bayes - Accuracy: {accuracy:.2f}')
print(f'Naive Bayes - Precision: {precision:.2f}')
print(f'Naive Bayes - Recall: {recall:.2f}')
print(f'Naive Bayes - F1 Score: {f1:.2f}')
Out[16]:
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GaussianNB()
Naive Bayes - Accuracy: 0.22 Naive Bayes - Precision: 0.18 Naive Bayes - Recall: 0.22 Naive Bayes - F1 Score: 0.13