# Load the necessary libraries

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load the dataset and keep only the columns used for modelling.
df = pd.read_csv('vehicles2.csv')

columns_to_keep = [
    'price', 'year', 'manufacturer', 'condition', 'cylinders',
    'fuel', 'odometer', 'title_status', 'transmission',
    'drive', 'type', 'paint_color', 'description'
]
df = df[columns_to_keep]

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
print('Data before processing')
df.info()

# Handle missing values.
# One DataFrame-level fillna replaces the per-column
# `df[col].fillna(..., inplace=True)` calls: that chained in-place form is
# deprecated in pandas 2.x and no longer modifies `df` under Copy-on-Write
# (the default from pandas 3), which would silently leave the NaNs in place.
# NOTE(review): 0 is a placeholder, not a plausible value, for 'year' and
# 'odometer' -- confirm that this imputation is intended.
fill_values = {
    'year': 0,
    'manufacturer': 'Unknown',
    'condition': 'Unknown',
    'cylinders': 'Unknown',
    'fuel': 'Unknown',
    'odometer': 0,
    'title_status': 'Unknown',
    'transmission': 'Unknown',
    # Was '' originally; aligned with the other categorical columns so the
    # dummy column is named 'drive_Unknown' instead of 'drive_'.
    'drive': 'Unknown',
    # NOTE(review): 'type' is the prediction target, so rows with a missing
    # label end up as their own 'Unknown' class -- confirm this is wanted
    # rather than dropping those rows.
    'type': 'Unknown',
    'paint_color': 'Unknown',
    'description': 'Unknown',
}
df = df.fillna(value=fill_values)

print('Data after processing')
df.info()

# Encode the target ('type') as integer class labels.
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

# One-hot encode every remaining object-dtype column.
# NOTE(review): this includes the free-text 'description' column, which yields
# one indicator column per distinct description -- consider dropping it or
# vectorising the text instead.
categorical_columns = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_columns)

# Split into features / target, then into an 80/20 train/test partition with
# a fixed seed so the partition is reproducible.
X = df.drop('type', axis=1)
y = df['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Data before processing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         10000 non-null  int64  
 1   year          9887 non-null   float64
 2   manufacturer  9517 non-null   object 
 3   condition     5347 non-null   object 
 4   cylinders     6713 non-null   object 
 5   fuel          9878 non-null   object 
 6   odometer      9942 non-null   float64
 7   title_status  9753 non-null   object 
 8   transmission  9917 non-null   object 
 9   drive         7101 non-null   object 
 10  type          8149 non-null   object 
 11  paint_color   7984 non-null   object 
 12  description   9973 non-null   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 1015.8+ KB
None
Data after processing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         10000 non-null  int64  
 1   year          10000 non-null  float64
 2   manufacturer  10000 non-null  object 
 3   condition     10000 non-null  object 
 4   cylinders     10000 non-null  object 
 5   fuel          10000 non-null  object 
 6   odometer      10000 non-null  float64
 7   title_status  10000 non-null  object 
 8   transmission  10000 non-null  object 
 9   drive         10000 non-null  object 
 10  type          10000 non-null  object 
 11  paint_color   10000 non-null  object 
 12  description   10000 non-null  object 
dtypes: float64(2), int64(1), object(10)
memory usage: 1015.8+ KB
None

# Logistic Regression: fit on the training split, score on the held-out split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# liblinear solver with the iteration cap raised to 2000.
logreg = LogisticRegression(max_iter=2000, solver='liblinear')
logreg.fit(X_train, y_train)

# Predict the test split and compute the metrics. Precision, recall and F1
# are averaged across classes weighted by support; zero_division=0 scores a
# class with no predicted/true samples as 0 instead of emitting a warning.
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted', zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

# Report all four metrics, one per line, rounded to two decimals.
print(
    f'Logistic Regression - Accuracy: {accuracy:.2f}',
    f'Logistic Regression - Precision: {precision:.2f}',
    f'Logistic Regression - Recall: {recall:.2f}',
    f'Logistic Regression - F1 Score: {f1:.2f}',
    sep='\n',
)

LogisticRegression(max_iter=2000, solver='liblinear')

LogisticRegression(max_iter=2000, solver='liblinear')

Logistic Regression - Accuracy: 0.27
Logistic Regression - Precision: 0.29
Logistic Regression - Recall: 0.27
Logistic Regression - F1 Score: 0.17

# Decision Tree: fit on the training split, score on the held-out split.
from sklearn.tree import DecisionTreeClassifier

# A fixed random_state makes the fitted tree, and therefore the reported
# metrics, reproducible across runs: without it scikit-learn permutes the
# candidate features randomly at each split, so results can differ from run
# to run. The value matches the seed used for train_test_split.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

# Predict the test split and compute the metrics. Precision, recall and F1
# are averaged across classes weighted by support; zero_division=0 scores a
# class with no predicted/true samples as 0 instead of emitting a warning.
y_pred = dtree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Report all four metrics rounded to two decimals.
print(f'Decision Tree - Accuracy: {accuracy:.2f}')
print(f'Decision Tree - Precision: {precision:.2f}')
print(f'Decision Tree - Recall: {recall:.2f}')
print(f'Decision Tree - F1 Score: {f1:.2f}')

DecisionTreeClassifier()

DecisionTreeClassifier()

Decision Tree - Accuracy: 0.73
Decision Tree - Precision: 0.73
Decision Tree - Recall: 0.73
Decision Tree - F1 Score: 0.73

# Gaussian Naive Bayes: fit on the training split, score on the held-out split.
from sklearn.naive_bayes import GaussianNB

# Default-configured Gaussian Naive Bayes classifier.
nb = GaussianNB()
nb.fit(X_train, y_train)

# Predict the test split and compute the metrics. Precision, recall and F1
# are averaged across classes weighted by support; zero_division=0 scores a
# class with no predicted/true samples as 0 instead of emitting a warning.
y_pred = nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted', zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

# Report all four metrics, one per line, rounded to two decimals.
print(
    f'Naive Bayes - Accuracy: {accuracy:.2f}',
    f'Naive Bayes - Precision: {precision:.2f}',
    f'Naive Bayes - Recall: {recall:.2f}',
    f'Naive Bayes - F1 Score: {f1:.2f}',
    sep='\n',
)

GaussianNB()

GaussianNB()

Naive Bayes - Accuracy: 0.22
Naive Bayes - Precision: 0.18
Naive Bayes - Recall: 0.22
Naive Bayes - F1 Score: 0.13

Vehicle Multi-Class Classification Using Machine Learning Algorithms