Maryland Traffic Violation Analysis
Summer 2024 Data Science Project
Abebaw Tereda, Oscar Javier Soto, and Geremew Belew
Introduction
The analysis of traffic violations in Maryland is essential for understanding the region's traffic-related incidents. This dataset, sourced from Kaggle, helps us identify common traffic violations and their frequencies, analyze temporal patterns, including peak times and seasonal variations, examine geographic distribution across different areas, and explore demographic factors, such as race and gender, to understand their influence on traffic violation trends.
This dataset provides detailed information on each traffic violation, including the date and time of the incident, location, description of the violation, and demographic details of the individuals involved. This detailed data enables us to analyze the temporal and spatial distribution of traffic violations and examine potential correlations with demographic variables.
Understanding the dynamics of traffic violations is crucial for developing targeted interventions to improve road safety and reduce traffic-related incidents. Insights from this analysis can inform policymakers, law enforcement agencies, and community stakeholders, enabling them to implement data-driven strategies to enhance traffic management and public safety.
Our analysis will use various statistical and data visualization techniques to understand patterns and relationships within the dataset. We will use Pandas for data manipulation, Matplotlib for visualization, and machine learning algorithms to predict trends and identify high-risk factors associated with traffic violations.
Data Curation
The dataset used in this analysis is sourced from Kaggle: Traffic Violations in Maryland County.
The dataset includes detailed information on each traffic violation, such as the date and time of the incident, location, description of the violation, and demographic details of the individuals involved. This helps us analyze the temporal and spatial distribution of traffic violations and examine potential correlations with demographic variables.
Data Preprocessing
Loading and Understanding the Dataset
First, we import the necessary libraries and load the dataset into a Pandas DataFrame. Pandas is a powerful data manipulation library in Python that simplifies data handling and preparation.
#Import libraries
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore, chi2_contingency
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
Load the Traffic_Violations.csv dataset into the traf_df DataFrame and display it.
#Display the dataframe after loading the csv file
traf_df = pd.read_csv("Traffic_Violations.csv", low_memory=False)
traf_df.head()
Date Of Stop | Time Of Stop | Agency | SubAgency | Description | Location | Latitude | Longitude | Accident | Belts | ... | Charge | Article | Contributed To Accident | Race | Gender | Driver City | Driver State | DL State | Arrest Type | Geolocation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 09/24/2013 | 17:11:00 | MCP | 3rd district, Silver Spring | DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGI... | 8804 FLOWER AVE | NaN | NaN | No | No | ... | 13-401(h) | Transportation Article | No | BLACK | M | TAKOMA PARK | MD | MD | A - Marked Patrol | NaN |
1 | 08/29/2017 | 10:19:00 | MCP | 2nd district, Bethesda | DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC... | WISCONSIN AVE@ ELM ST | 38.981725 | -77.092757 | No | No | ... | 21-201(a1) | Transportation Article | No | WHITE | F | FAIRFAX STATION | VA | VA | A - Marked Patrol | (38.981725, -77.0927566666667) |
2 | 12/01/2014 | 12:52:00 | MCP | 6th district, Gaithersburg / Montgomery Village | FAILURE STOP AND YIELD AT THRU HWY | CHRISTOPHER AVE/MONTGOMERY VILLAGE AVE | 39.162888 | -77.229088 | No | No | ... | 21-403(b) | Transportation Article | No | BLACK | F | UPPER MARLBORO | MD | MD | A - Marked Patrol | (39.1628883333333, -77.2290883333333) |
3 | 08/29/2017 | 09:22:00 | MCP | 3rd district, Silver Spring | FAILURE YIELD RIGHT OF WAY ON U TURN | CHERRY HILL RD./CALVERTON BLVD. | 39.056975 | -76.954633 | No | No | ... | 21-402(b) | Transportation Article | No | BLACK | M | FORT WASHINGTON | MD | MD | A - Marked Patrol | (39.056975, -76.9546333333333) |
4 | 08/28/2017 | 23:41:00 | MCP | 6th district, Gaithersburg / Montgomery Village | FAILURE OF DR. TO MAKE LANE CHANGE TO AVAIL. L... | 355 @ SOUTH WESTLAND DRIVE | NaN | NaN | No | No | ... | 21-405(e1) | Transportation Article | No | WHITE | M | GAITHERSBURG | MD | MD | A - Marked Patrol | NaN |
5 rows × 35 columns
Display the column names.
#Display the column list
traf_df.columns
Index(['Date Of Stop', 'Time Of Stop', 'Agency', 'SubAgency', 'Description', 'Location', 'Latitude', 'Longitude', 'Accident', 'Belts', 'Personal Injury', 'Property Damage', 'Fatal', 'Commercial License', 'HAZMAT', 'Commercial Vehicle', 'Alcohol', 'Work Zone', 'State', 'VehicleType', 'Year', 'Make', 'Model', 'Color', 'Violation Type', 'Charge', 'Article', 'Contributed To Accident', 'Race', 'Gender', 'Driver City', 'Driver State', 'DL State', 'Arrest Type', 'Geolocation'], dtype='object')
The dataset includes the following columns:
- Date Of Stop : Date of the traffic violation.
- Time Of Stop : Time of the traffic violation.
- Agency : Agency issuing the traffic violation. (Example: MCP is Montgomery County Police)
- SubAgency : Court code representing the district of assignment of the officer. (R15 = 1st district, Rockville; B15 = 2nd district, Bethesda; SS15 = 3rd district, Silver Spring; WG15 = 4th district, Wheaton; G15 = 5th district, Germantown; M15 = 6th district, Gaithersburg / Montgomery Village; HQ15 = Headquarters and Special Operations)
- Description : Text description of the specific charge.
- Location : Location of the violation, usually an address or intersection.
- Latitude : Latitude location of the traffic violation.
- Longitude : Longitude location of the traffic violation.
- Accident : If traffic violation involved an accident.
- Belts : If traffic violation involved a seat belt violation.
- Personal Injury : If traffic violation involved Personal Injury.
- Property Damage : If traffic violation involved Property Damage.
- Fatal : If traffic violation involved a fatality.
- Commercial License : If driver holds a Commercial Drivers License.
- HAZMAT : If the traffic violation involved hazardous materials.
- Commercial Vehicle : If the vehicle committing the traffic violation is a commercial vehicle.
- Alcohol : If the traffic violation was alcohol related.
- Work Zone : If the traffic violation was in a work zone.
- State : State issuing the vehicle registration.
- VehicleType : Type of vehicle (Examples: Automobile, Station Wagon, Heavy Duty Truck, etc.)
- Year : Year vehicle was made
- Make : Manufacturer of the vehicle (Examples: Ford, Chevy, Honda, Toyota, etc.)
- Model : Model of the vehicle.
- Color : Color of the vehicle.
- Violation Type : Violation type. (Examples: Warning, Citation, SERO)
- Charge : Numeric code for the specific charge.
- Article : Article of State Law. (TA = Transportation Article, MR = Maryland Rules)
- Contributed To Accident : If the traffic violation was a contributing factor in an accident.
- Race : Race of the driver. (Example: Asian, Black, White, Other, etc.)
- Gender : Gender of the driver (F = Female, M = Male)
- Driver City : City of the driver's home address.
- Driver State : State of the driver's home address.
- DL State : State issuing the Driver's License.
- Arrest Type : Type of Arrest (A = Marked, B = Unmarked, etc.)
- Geolocation : Geo-coded location information.
In this step, we inspect the data types of each column in the original dataset. Understanding the data types is crucial because it helps us identify which columns need type conversion or further preprocessing. For instance, date and time columns should be converted to datetime objects, and categorical variables might need to be converted to numerical formats for analysis. By examining the data types, we ensure that the data is in the correct format for subsequent analysis and manipulation.
#Display the data types of the original dataset
traf_df.dtypes
Date Of Stop object Time Of Stop object Agency object SubAgency object Description object Location object Latitude float64 Longitude float64 Accident object Belts object Personal Injury object Property Damage object Fatal object Commercial License object HAZMAT object Commercial Vehicle object Alcohol object Work Zone object State object VehicleType object Year float64 Make object Model object Color object Violation Type object Charge object Article object Contributed To Accident object Race object Gender object Driver City object Driver State object DL State object Arrest Type object Geolocation object dtype: object
To understand the size and structure of our dataset, we check the number of rows and columns. This information provides an overview of the dataset's dimensions and helps in assessing the volume of data we are working with. Knowing the number of rows and columns is also useful for later steps in data analysis, such as memory management and performance optimization.
#Display the number of rows and columns
number_of_rows, number_of_columns = traf_df.shape
print("Number of rows: ", number_of_rows)
print("Number of columns: ", number_of_columns)
Number of rows:  1292399
Number of columns:  35
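As an optional supplement (not part of the original workflow), pandas' DataFrame.info can report dtypes, non-null counts, and an approximate memory footprint in a single summary; a minimal sketch:
# Optional: dtypes, non-null counts, and memory usage in one call.
# memory_usage='deep' measures object (string) columns accurately, which is slower but precise.
traf_df.info(memory_usage='deep')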
To ensure the quality and completeness of our dataset, we need to identify columns with missing values. By using the count function, we can determine the number of non-null entries in each column. Comparing these counts with the total number of rows in the dataset helps us pinpoint columns that contain missing data. Identifying these columns is the first step in handling missing values, which is crucial for accurate data analysis and modeling.
#Use count() to see how many non-null entries each column has
traf_df.count()
Date Of Stop 1292399 Time Of Stop 1292399 Agency 1292399 SubAgency 1292389 Description 1292390 Location 1292397 Latitude 1197045 Longitude 1197045 Accident 1292399 Belts 1292399 Personal Injury 1292399 Property Damage 1292399 Fatal 1292399 Commercial License 1292399 HAZMAT 1292399 Commercial Vehicle 1292399 Alcohol 1292399 Work Zone 1292399 State 1292340 VehicleType 1292399 Year 1284325 Make 1292342 Model 1292212 Color 1276272 Violation Type 1292399 Charge 1292399 Article 1227230 Contributed To Accident 1292399 Race 1292399 Gender 1292399 Driver City 1292182 Driver State 1292388 DL State 1291470 Arrest Type 1292399 Geolocation 1197045 dtype: int64
To ensure the integrity and uniqueness of our dataset, we need to check for duplicate entries. Duplicate rows can distort analysis results and lead to incorrect conclusions. By using the duplicated()
function, we can identify rows that are exact duplicates of others. This step helps us maintain a clean dataset by identifying and removing redundant data points.
duplicate_rows = traf_df.duplicated()
print("Duplicate entries: ", traf_df[duplicate_rows].shape[0])
Duplicate entries: 1588
In this step, we identify which columns have missing values and how many missing values each column contains. This information is essential for understanding the completeness of our dataset. By using the isnull().sum()
function, we can quickly see the number of missing entries in each column. This helps us decide how to handle these missing values, whether by removing rows, filling in missing values, or using other imputation methods.
# Check which columns have missing values
missing_values_count = traf_df.isnull().sum()
missing_values_count
Date Of Stop 0 Time Of Stop 0 Agency 0 SubAgency 10 Description 9 Location 2 Latitude 95354 Longitude 95354 Accident 0 Belts 0 Personal Injury 0 Property Damage 0 Fatal 0 Commercial License 0 HAZMAT 0 Commercial Vehicle 0 Alcohol 0 Work Zone 0 State 59 VehicleType 0 Year 8074 Make 57 Model 187 Color 16127 Violation Type 0 Charge 0 Article 65169 Contributed To Accident 0 Race 0 Gender 0 Driver City 217 Driver State 11 DL State 929 Arrest Type 0 Geolocation 95354 dtype: int64
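Before deciding to drop rows outright, it is worth noting what an imputation-based alternative could look like. The sketch below is not used in this project (column names follow the listing above); it fills a categorical column with a placeholder, a numeric column with its median, and only drops rows missing coordinates, which are hard to impute meaningfully:
# Hedged alternative to dropping all rows with missing values (not used in this analysis)
traf_imputed = traf_df.copy()
traf_imputed['Color'] = traf_imputed['Color'].fillna('UNKNOWN')                    # categorical placeholder
traf_imputed['Year'] = traf_imputed['Year'].fillna(traf_imputed['Year'].median())  # numeric median
traf_imputed = traf_imputed.dropna(subset=['Latitude', 'Longitude'])               # coordinates left as-is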
Data Cleaning
Before diving into exploratory data analysis (EDA), and given the problems identified above, it is crucial to clean and preprocess the dataset so that the data is in the correct format and free from inconsistencies.
We found duplicate rows in the dataset, which can arise from data entry errors or repeated data collection (since we did not gather this data ourselves, we cannot be sure of their origin). Duplicates can skew our analysis by over-representing certain data points, so we removed them to ensure that each record is unique. For example, if a single violation was recorded multiple times, it could falsely indicate a higher incidence of that type of violation, leading to inaccurate conclusions.
# Remove the duplicate rows
traf_df = traf_df.drop_duplicates()
Additionally, there were many missing values in the dataset. Missing values can interfere with our statistical calculations and data analysis processes. By removing rows with missing values, we ensure that the dataset is complete and ready for accurate analysis, maintaining the integrity of our statistical calculations. For instance, if certain rows lack information about the violation type or driver's race, including these incomplete rows could lead to misleading results.
# Remove the missing values
traf_df = traf_df.dropna()
We also changed the types of some features. For instance, the "Date Of Stop" and "Time Of Stop" columns were initially stored as strings. We converted these columns to datetime format, which allows us to extract features such as the year, month, day, and hour of the traffic stop. This conversion is crucial for temporal analysis: it lets us identify peak traffic-stop times and trends and patterns over time.
We also changed the "Year" column (the vehicle's model year) to integer type. Keeping "Year" numeric allows sorting, filtering, and statistical comparisons; for example, comparing violation rates across vehicle model years requires a numeric column.
# Convert the date of stop to datetime (dates are in month/day/year format)
traf_df['Date Of Stop'] = pd.to_datetime(traf_df['Date Of Stop'], format='%m/%d/%Y')
# change the data type of time to datetime
traf_df['Time Of Stop'] = pd.to_datetime(traf_df['Time Of Stop'], format="%H:%M:%S")
traf_df['Year'] = traf_df['Year'].astype('int32')
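As a quick illustration of what the datetime conversion enables, the following sketch derives a few temporal features via the .dt accessor; month and weekday are shown only as examples, while the analysis below uses the incident year and hour:
# Illustrative examples of temporal features unlocked by the datetime conversion
stop_month = traf_df['Date Of Stop'].dt.month          # 1-12
stop_weekday = traf_df['Date Of Stop'].dt.day_name()   # 'Monday', 'Tuesday', ...
stop_month.value_counts().sort_index()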
# Check the updates so far
traf_df.head()
Date Of Stop | Time Of Stop | Agency | SubAgency | Description | Location | Latitude | Longitude | Accident | Belts | ... | Charge | Article | Contributed To Accident | Race | Gender | Driver City | Driver State | DL State | Arrest Type | Geolocation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2017-08-29 | 1900-01-01 10:19:00 | MCP | 2nd district, Bethesda | DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC... | WISCONSIN AVE@ ELM ST | 38.981725 | -77.092757 | No | No | ... | 21-201(a1) | Transportation Article | No | WHITE | F | FAIRFAX STATION | VA | VA | A - Marked Patrol | (38.981725, -77.0927566666667) |
2 | 2014-12-01 | 1900-01-01 12:52:00 | MCP | 6th district, Gaithersburg / Montgomery Village | FAILURE STOP AND YIELD AT THRU HWY | CHRISTOPHER AVE/MONTGOMERY VILLAGE AVE | 39.162888 | -77.229088 | No | No | ... | 21-403(b) | Transportation Article | No | BLACK | F | UPPER MARLBORO | MD | MD | A - Marked Patrol | (39.1628883333333, -77.2290883333333) |
3 | 2017-08-29 | 1900-01-01 09:22:00 | MCP | 3rd district, Silver Spring | FAILURE YIELD RIGHT OF WAY ON U TURN | CHERRY HILL RD./CALVERTON BLVD. | 39.056975 | -76.954633 | No | No | ... | 21-402(b) | Transportation Article | No | BLACK | M | FORT WASHINGTON | MD | MD | A - Marked Patrol | (39.056975, -76.9546333333333) |
6 | 2013-10-08 | 1900-01-01 13:23:00 | MCP | 4th district, Wheaton | DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGI... | GEORGIA AVE / BEL PRE RD | 39.093383 | -77.079552 | No | No | ... | 13-401(h) | Transportation Article | No | HISPANIC | M | BELTSVILLE | MD | MD | A - Marked Patrol | (39.0933833333333, -77.0795516666667) |
10 | 2014-02-14 | 1900-01-01 20:10:00 | MCP | 1st district, Rockville | FAILURE TO DRIVE ON RIGHT HAND ROADWAY OF DIVI... | GATEWAY CENTER DR @ CLARKSBURG RD | 39.234843 | -77.281540 | No | No | ... | 21-311(1) | Transportation Article | No | WHITE | M | POINT OF ROCK | MD | WV | A - Marked Patrol | (39.2348434333333, -77.28153995) |
5 rows × 35 columns
We then dropped certain columns: "Geolocation," "Agency," "Article," "Color," and "Accident." These columns contain information not directly relevant to our analysis. For instance, "Geolocation" is redundant because the Latitude and Longitude columns already provide the coordinates, the "Accident" column contains only a single value and is therefore uninformative, and the color of the car plays no role in our analysis.
We also converted categorical variables with 'Yes'/'No' values into binary numeric values (1 for 'Yes', 0 for 'No'). This binary encoding allows us to use these variables in mathematical operations and machine learning algorithms. For example, encoding the "Alcohol" and "Work Zone" columns as 1 and 0 lets us analyze the impact of alcohol involvement on the type of violation efficiently.
# Drop unnecessary feature for our analysis
traf_df.drop(['Geolocation', 'Agency', 'Article', 'Color','Accident'], axis='columns', inplace=True)
# Change the value 'Yes' into 1 and 'No' into 0 for some features (binary encoding)
traf_df['Belts'] = traf_df['Belts'].map({'Yes': 1, 'No': 0})
traf_df['Personal Injury'] = traf_df['Personal Injury'].map({'Yes': 1, 'No': 0})
traf_df['Property Damage'] = traf_df['Property Damage'].map({'Yes': 1, 'No': 0})
traf_df['Commercial License'] = traf_df['Commercial License'].map({'Yes': 1, 'No': 0})
traf_df['Commercial Vehicle'] = traf_df['Commercial Vehicle'].map({'Yes': 1, 'No': 0})
traf_df['Alcohol'] = traf_df['Alcohol'].map({'Yes': 1, 'No': 0})
traf_df['Work Zone'] = traf_df['Work Zone'].map({'Yes': 1, 'No': 0})
traf_df['HAZMAT'] = traf_df['HAZMAT'].map({'Yes': 1, 'No': 0})
traf_df['Fatal'] = traf_df['Fatal'].map({'Yes': 1, 'No': 0})
traf_df['Contributed To Accident'] = traf_df['Contributed To Accident'].map({'Yes': 1, 'No': 0})
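An equivalent, more compact way to apply the same Yes/No mapping is to loop over the column names; the dtype guard makes this sketch safe to run even after the cell above has already converted the columns:
# Compact equivalent of the ten map() calls above (same behavior)
yes_no_cols = ['Belts', 'Personal Injury', 'Property Damage', 'Commercial License',
               'Commercial Vehicle', 'Alcohol', 'Work Zone', 'HAZMAT', 'Fatal',
               'Contributed To Accident']
for col in yes_no_cols:
    if traf_df[col].dtype == object:   # skip columns already converted to 0/1
        traf_df[col] = traf_df[col].map({'Yes': 1, 'No': 0})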
# Now Check the cleaned dataframe
traf_df
Date Of Stop | Time Of Stop | SubAgency | Description | Location | Latitude | Longitude | Belts | Personal Injury | Property Damage | ... | Model | Violation Type | Charge | Contributed To Accident | Race | Gender | Driver City | Driver State | DL State | Arrest Type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2017-08-29 | 1900-01-01 10:19:00 | 2nd district, Bethesda | DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC... | WISCONSIN AVE@ ELM ST | 38.981725 | -77.092757 | 0 | 0 | 0 | ... | COROLLA | Citation | 21-201(a1) | 0 | WHITE | F | FAIRFAX STATION | VA | VA | A - Marked Patrol |
2 | 2014-12-01 | 1900-01-01 12:52:00 | 6th district, Gaithersburg / Montgomery Village | FAILURE STOP AND YIELD AT THRU HWY | CHRISTOPHER AVE/MONTGOMERY VILLAGE AVE | 39.162888 | -77.229088 | 0 | 0 | 1 | ... | ACCORD | Citation | 21-403(b) | 0 | BLACK | F | UPPER MARLBORO | MD | MD | A - Marked Patrol |
3 | 2017-08-29 | 1900-01-01 09:22:00 | 3rd district, Silver Spring | FAILURE YIELD RIGHT OF WAY ON U TURN | CHERRY HILL RD./CALVERTON BLVD. | 39.056975 | -76.954633 | 0 | 0 | 1 | ... | DAKOTA | Citation | 21-402(b) | 0 | BLACK | M | FORT WASHINGTON | MD | MD | A - Marked Patrol |
6 | 2013-10-08 | 1900-01-01 13:23:00 | 4th district, Wheaton | DRIVING VEHICLE ON HIGHWAY WITH SUSPENDED REGI... | GEORGIA AVE / BEL PRE RD | 39.093383 | -77.079552 | 0 | 0 | 0 | ... | PICKUP | Citation | 13-401(h) | 0 | HISPANIC | M | BELTSVILLE | MD | MD | A - Marked Patrol |
10 | 2014-02-14 | 1900-01-01 20:10:00 | 1st district, Rockville | FAILURE TO DRIVE ON RIGHT HAND ROADWAY OF DIVI... | GATEWAY CENTER DR @ CLARKSBURG RD | 39.234843 | -77.281540 | 0 | 0 | 0 | ... | STS | Citation | 21-311(1) | 0 | WHITE | M | POINT OF ROCK | MD | WV | A - Marked Patrol |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1291736 | 2012-11-12 | 1900-01-01 09:40:00 | 6th district, Gaithersburg / Montgomery Village | EXCEEDING THE POSTED SPEED LIMIT OF 35 MPH | CLOPPER ROAD AT ORCHARD RIDGE DRIVE | 39.147593 | -77.234835 | 0 | 0 | 0 | ... | SABLE | Warning | 21-801.1 | 0 | BLACK | F | GERMANTOWN | MD | MD | A - Marked Patrol |
1291737 | 2012-11-12 | 1900-01-01 09:51:00 | 6th district, Gaithersburg / Montgomery Village | EXCEEDING THE POSTED SPEED LIMIT OF 35 MPH | CLOPPER ROAD AT ORCHARD RIDGE DRIVE | 39.147572 | -77.234841 | 0 | 0 | 0 | ... | ECLIPSE | Warning | 21-801.1 | 0 | WHITE | M | GERMANTOWN | MD | MD | Q - Marked Laser |
1291738 | 2012-11-12 | 1900-01-01 10:21:00 | 6th district, Gaithersburg / Montgomery Village | EXCEEDING THE POSTED SPEED LIMIT OF 40 MPH | FREDERICK AVE AT PROFESSIONAL DR | 39.162809 | -77.225643 | 0 | 0 | 0 | ... | CIICIC | Warning | 21-801.1 | 0 | ASIAN | F | FREDERICK | MD | MD | Q - Marked Laser |
1291739 | 2012-11-12 | 1900-01-01 10:34:00 | 6th district, Gaithersburg / Montgomery Village | EXCEEDING THE POSTED SPEED LIMIT OF 40 MPH | FREDERICK AVE AT PROFESSIONAL DR | 39.161533 | -77.223629 | 0 | 0 | 0 | ... | COOPER | Warning | 21-801.1 | 0 | WHITE | M | DAMASCUS | MD | MD | Q - Marked Laser |
1291740 | 2012-11-12 | 1900-01-01 12:57:00 | 6th district, Gaithersburg / Montgomery Village | EXCEEDING THE POSTED SPEED LIMIT OF 30 MPH | EAST VILLAGE AVE AT BAY POINT PL | 39.187870 | -77.160296 | 0 | 0 | 0 | ... | ODYSSEY | Warning | 21-801.1 | 0 | BLACK | F | MONTGOMERY VILLAGE | MD | MD | Q - Marked Laser |
1119016 rows × 30 columns
Exploratory Data Analysis and Visualization
In our analysis of the Maryland Traffic Violations dataset, we explore several hypotheses about what may influence the type of violation issued (Citation vs. Warning), with a focus on demographic analysis. Our primary hypothesis is that there is a significant difference in the likelihood of receiving a citation versus a warning based on the driver's race. By analyzing the "Race" column, we can assess whether certain racial groups disproportionately receive citations compared to warnings; this is crucial for identifying any racial disparities in traffic enforcement practices. Given the richness of the dataset, we also examine other features that might affect the type of violation, including time of day, gender, geographic location, driver characteristics, vehicle characteristics, and incident characteristics.
First, we ask whether certain times of day are more likely to result in citations than warnings. This involves examining the "Time Of Stop" column to identify peak hours for traffic stops and assessing whether those peaks correspond to higher rates of citations or warnings. Understanding time-of-day patterns can reveal time-specific enforcement biases or trends in driver behavior.
Similarly, we explore whether the gender of the driver influences the type of violation received. The "Gender" column is examined to determine whether citation and warning rates differ noticeably between male and female drivers, which helps us understand whether gender plays a role in traffic enforcement decisions.
The location of the traffic stop, captured in the "Driver City" and "SubAgency" columns, could also play a significant role in the type of violation issued. We analyze whether certain areas are associated with higher rates of citations versus warnings; this geographic analysis can help identify hotspots for traffic violations and potential location-based enforcement practices.
Beyond race and gender, other driver characteristics may influence the outcome. Using columns related to driver behavior (such as "Alcohol" or driving in a "Work Zone"), we can analyze whether drivers under the influence are more likely to receive citations than warnings, providing insight into how driver behavior affects enforcement decisions.
The type and characteristics of the vehicle involved may also be relevant. Columns such as "Make", "Model", and "Year" can be analyzed to determine whether certain vehicle types are more prone to citations or warnings, helping us see whether there are biases related to the type of vehicle driven.
Finally, details about the incident itself, such as the "Description" of the violation, provide context for why some violations result in citations while others result in warnings. By analyzing the descriptions and categorizing the types of violations, we can identify patterns in enforcement based on the severity or nature of the violation.
categorical_summary = traf_df.describe(include=[object])
categorical_summary
SubAgency | Description | Location | State | VehicleType | Make | Model | Violation Type | Charge | Race | Gender | Driver City | Driver State | DL State | Arrest Type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 | 1119016 |
unique | 7 | 11653 | 167471 | 69 | 32 | 3299 | 16125 | 3 | 943 | 6 | 3 | 6870 | 67 | 70 | 19 |
top | 4th district, Wheaton | DRIVER FAILURE TO OBEY PROPERLY PLACED TRAFFIC... | IS 370 @ IS 270 | MD | 02 - Automobile | TOYOTA | 4S | Warning | 21-801.1 | WHITE | M | SILVER SPRING | MD | MD | A - Marked Patrol |
freq | 285582 | 95013 | 1957 | 978219 | 987126 | 127787 | 123264 | 577392 | 153476 | 405636 | 746578 | 278652 | 1006703 | 969879 | 912452 |
We used the describe() function in pandas. This function provides a summary of statistics for each column, offering valuable insights into the distribution and diversity of values within our dataset. Here is an interpretation of the output for categorical data:
Count - The total number of non-null entries in each column. It shows how many valid data points we have for each feature; as the output confirms, there are no missing values in any feature after cleaning.
Unique - The number of distinct values in each column. It gives us a sense of the variety within the data; for example, knowing the number of unique sub-agencies, locations, or vehicle types helps us understand the diversity of the dataset.
Top - The most frequently occurring value in each column (the mode). This highlights the most common characteristics within the dataset, such as the most common violation type, race, or gender.
Freq - The frequency of the most common value. It shows how dominant the top value is; for instance, if the most common violation type occurs very frequently, it suggests a potential focus area for traffic enforcement.
The "SubAgency" column has a certain number of unique sub-agencies, with the most common sub-agency appearing frequently.
The "Description" column shows a variety of violation descriptions, with one specific type of violation being the most common.
The "Location" column indicates numerous unique locations where violations occur, with a particular location having the highest frequency of violations.
The "State", "VehicleType", "Make", and "Model" columns reveal diverse entries, with specific states, vehicle types, makes, and models appearing more frequently.
The "Violation Type" column shows the distribution between citations and warnings, highlighting which type is more common.
The "Charge", "Race", "Gender", "Driver City", "Driver State", "DL State", and "Arrest Type" columns provide insights into the most common charges, races, genders, driver cities, states, and arrest types.
Now let's look at the distribution of violation types; this will help guide further investigation.
traf_df['Violation Type'].value_counts()
Warning     577392
Citation    541601
SERO            23
Name: Violation Type, dtype: int64
plt.figure(figsize=(10, 6))
traf_df['Violation Type'].value_counts().plot(kind='bar')
plt.title('Distribution of Violation Types')
plt.xlabel('Violation Type')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()
We considered the "SERO" violation type as an outlier. As you can see, it is insignificant in terms of quantity compared to "Warning" and "Citation" types:
- Warning: 577,392 instances
- Citation: 541,601 instances
- SERO: 23 instances
Given the extremely low frequency of "SERO", it does not provide substantial information for our analysis and could potentially skew the results. Therefore, we decided to remove "SERO" entries from the dataset to focus on the more relevant violation types.
traf_df = traf_df[traf_df['Violation Type'] != 'SERO']
Now let's dive into our main hypothesis to understand the race distribution for each violation type. By examining the "Race" column, we aim to uncover any significant differences in the likelihood of receiving a citation versus a warning based on the driver's race.
race_violation_counts = traf_df.groupby('Race')['Violation Type'].value_counts().unstack(fill_value=0)
print(race_violation_counts)
Violation Type   Citation  Warning
Race
ASIAN               26838    38876
BLACK              175876   176953
HISPANIC           129647   101815
NATIVE AMERICAN      1165     1450
OTHER               25700    35042
WHITE              182375   223256
# Pass figsize directly to the plot call; DataFrame.plot creates its own figure
race_violation_counts.plot(kind='bar', figsize=(14, 8))
plt.title('Counts of Violation Types by Race')
plt.xlabel('Race')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Violation Type')
plt.show()
race_violation_counts = traf_df.groupby(['Race', 'Violation Type']).size().unstack(fill_value=0)
# Calculate the proportions of each violation type within each race
race_violation_proportions = race_violation_counts.div(race_violation_counts.sum(axis=1), axis=0)
# Determine the number of rows and columns for the subplots
num_races = race_violation_proportions.shape[0]
num_cols = (num_races + 1) // 2
num_rows = 2
# Set up the figure and axes for the pie charts
fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, 10))
# Flatten the axes array for easy iteration
axes = axes.flatten()
# Create a pie chart for each race
for ax, (race, proportions) in zip(axes, race_violation_proportions.iterrows()):
proportions.plot(kind='pie', ax=ax, autopct='%1.1f%%', startangle=140, legend=False)
ax.set_title(f'Violation Type Proportions for {race}')
ax.set_ylabel('')
# Remove any unused subplots
for i in range(num_races, len(axes)):
fig.delaxes(axes[i])
plt.tight_layout()
plt.show()
The highest number of traffic stops were recorded for the "WHITE" racial group, followed by "BLACK" and "HISPANIC". "NATIVE AMERICAN" has the lowest counts for both citations and warnings.
In terms of proportions, ASIAN drivers are more likely to receive warnings (59.2%) than citations (40.8%). BLACK drivers have nearly equal chances of receiving a citation (49.8%) or a warning (50.2%). HISPANIC drivers are more likely to receive citations (56.0%) than warnings (44.0%). NATIVE AMERICAN drivers are more likely to receive warnings (55.4%) than citations (44.6%). Drivers in the OTHER racial group are more likely to receive warnings (57.7%) than citations (42.3%). WHITE drivers are more likely to receive warnings (55.0%) than citations (45.0%).
This indicates potential disparities in traffic enforcement practices across racial groups. For instance, "HISPANIC" drivers are more likely to receive citations than warnings, whereas "ASIAN", "NATIVE AMERICAN", "OTHER", and "WHITE" drivers are more likely to receive warnings, and "BLACK" drivers have a nearly equal split between citations and warnings.
However, consider the population demographics of Maryland (source: Maryland Population 2024):
- White: 52.67%
- Black or African American: 29.83%
- Asian: 6.46%
- Two or more races: 5.4%
- Other race: 5.32%
- Native American: 0.28%
- Native Hawaiian or Pacific Islander: 0.05%
One reason Asians and Native Americans account for fewer violations overall is that their populations in Maryland are smaller, so they naturally have lower raw counts of violations.
While population size in Maryland could be a confounding variable, our focus is on the proportion of each racial group receiving citations versus warnings among those stopped. We aim to understand how factors like race, gender, geography, and time influence these outcomes.
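To make the population comparison concrete, the sketch below relates each group's share of recorded stops to its approximate share of the state population, using the percentages quoted above (Hispanic is omitted because the source treats it as an ethnicity rather than a race); a ratio above 1 means a group appears among stops more often than its population share alone would suggest:
# Hedged sketch: stop share vs. approximate population share (percentages from the source above)
population_share = {'WHITE': 0.5267, 'BLACK': 0.2983, 'ASIAN': 0.0646, 'NATIVE AMERICAN': 0.0028}
stops_by_race = race_violation_counts.sum(axis=1)      # total stops per racial group
stop_share = stops_by_race / stops_by_race.sum()
for race, pop in population_share.items():
    print(f"{race:>16}: stop share {stop_share[race]:.3f}, population share {pop:.3f}, "
          f"ratio {stop_share[race] / pop:.2f}")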
Let's examine additional factors to uncover broader patterns and disparities in traffic violations. Each factor (work zone, gender, sub-agency, hour of the day, and year of the incident) provides a unique perspective on how traffic violations are distributed.
traf_df = traf_df.copy()
traf_df['Year Incident'] = traf_df['Date Of Stop'].dt.year
traf_df['Hour'] = traf_df['Time Of Stop'].dt.hour
# Summary statistics for violation types by year
violation_by_year = traf_df.groupby('Year Incident')['Violation Type'].value_counts().unstack(fill_value=0)
print("Summary statistics for violation types by year:")
display(violation_by_year)
# Plot violation types by year
violation_by_year.plot(kind='bar')
plt.title('Violation Types by Year')
plt.xlabel('Year')
plt.ylabel('Count')
plt.legend(title='Violation Type')
plt.show()
Summary statistics for violation types by year:
Violation Type | Citation | Warning |
---|---|---|
Year Incident | ||
2012 | 75079 | 60367 |
2013 | 83654 | 66887 |
2014 | 93100 | 94335 |
2015 | 96377 | 110066 |
2016 | 91433 | 101473 |
2017 | 75158 | 103365 |
2018 | 26800 | 40899 |
import matplotlib.pyplot as plt
# Grouping and summarizing the data
violation_by_hour = traf_df.groupby('Hour')['Violation Type'].value_counts().unstack(fill_value=0)
print("Summary statistics for violation types by hour:")
display(violation_by_hour)
# Plotting violation types by hour
ax = violation_by_hour.plot(figsize=(10, 6))
ax.set_title('Violation Types by Hour')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Count')
ax.legend(title='Violation Type')
ax.set_xticks(range(1, 25))
ax.xaxis.grid(True, which='both')
plt.show()
Summary statistics for violation types by hour:
Violation Type | Citation | Warning |
---|---|---|
Hour | ||
0 | 29473 | 30839 |
1 | 25701 | 22841 |
2 | 22038 | 14748 |
3 | 16513 | 7898 |
4 | 9821 | 4606 |
5 | 6136 | 3542 |
6 | 9477 | 6783 |
7 | 20678 | 19570 |
8 | 31085 | 30118 |
9 | 27151 | 28941 |
10 | 26105 | 31110 |
11 | 22545 | 25214 |
12 | 21710 | 22077 |
13 | 22009 | 22889 |
14 | 23753 | 22509 |
15 | 21383 | 18363 |
16 | 26663 | 28033 |
17 | 26896 | 30882 |
18 | 21646 | 23322 |
19 | 18743 | 20719 |
20 | 18699 | 24894 |
21 | 24374 | 37301 |
22 | 33603 | 51922 |
23 | 35399 | 48271 |
Let's look at a statistical summary of the hours and years of the violations.
most_violations_year = traf_df.groupby(['Violation Type', 'Year']).size().reset_index(name='Counts')
most_violations_year = most_violations_year.loc[most_violations_year.groupby('Violation Type')['Counts'].idxmax()]
print("Year with the most violations for each type:")
print(most_violations_year)
# Find the hour with the most violations for each type
most_violations_hour = traf_df.groupby(['Violation Type', 'Hour']).size().reset_index(name='Counts')
most_violations_hour = most_violations_hour.loc[most_violations_hour.groupby('Violation Type')['Counts'].idxmax()]
print("Hour with the most violations for each type:")
print(most_violations_hour)
Year with the most violations for each type:
    Violation Type  Year  Counts
88        Citation  2006   33437
276        Warning  2007   35993
Hour with the most violations for each type:
   Violation Type  Hour  Counts
23       Citation    23   35399
46        Warning    22   51922
Yearly Analysis
Note that this grouping uses the "Year" column, which records the vehicle's model year rather than the year of the stop: 2006 is the most common vehicle model year among citations, and 2007 among warnings. By incident year (see the table above), 2015 saw both the most citations and the most warnings.
Hourly Analysis
The peak hour for citations was 11 PM, which might indicate heavier traffic enforcement during late hours or more violations occurring at that time.
The peak hour for warnings was 10 PM, which again suggests a significant number of late-night traffic stops and possibly a different approach to issuing warnings versus citations.
Gender, work zone, alcohol, and sub-agency distributions
# figure and axes for the 2x2 grid of subplots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
# Flatten the axes array for easy iteration
axes = axes.flatten()
# Plot 1: Gender vs Violation Type
gender_violation_counts = traf_df.groupby(['Gender', 'Violation Type']).size().unstack(fill_value=0)
gender_violation_proportions = gender_violation_counts.div(gender_violation_counts.sum(axis=1), axis=0)
gender_violation_counts.plot(kind='bar', stacked=True, ax=axes[0], color=['skyblue', 'lightgreen'])
axes[0].set_title('Violation Type Counts by Gender')
axes[0].set_xlabel('Gender')
axes[0].set_ylabel('Count')
axes[0].legend(title='Violation Type')
axes[0].tick_params(axis='x', rotation=0)
# Plot 2: Work Zone vs Violation Type
work_zone_violation_counts = traf_df.groupby(['Work Zone', 'Violation Type']).size().unstack(fill_value=0)
work_zone_violation_proportions = work_zone_violation_counts.div(work_zone_violation_counts.sum(axis=1), axis=0)
work_zone_violation_counts.plot(kind='bar', stacked=True, ax=axes[1], color=['salmon', 'orchid'])
axes[1].set_title('Violation Type Counts by Work Zone')
axes[1].set_xlabel('Work Zone')
axes[1].set_ylabel('Count')
axes[1].legend(title='Violation Type')
axes[1].tick_params(axis='x', rotation=0)
# Plot 3: Alcohol Involvement vs Violation Type
alcohol_violation_counts = traf_df.groupby(['Alcohol', 'Violation Type']).size().unstack(fill_value=0)
alcohol_violation_proportions = alcohol_violation_counts.div(alcohol_violation_counts.sum(axis=1), axis=0)
alcohol_violation_counts.plot(kind='bar', stacked=True, ax=axes[2], color=['lightcoral', 'lightblue'])
axes[2].set_title('Violation Type Counts by Alcohol Involvement')
axes[2].set_xlabel('Alcohol Involvement')
axes[2].set_ylabel('Count')
axes[2].legend(title='Violation Type')
axes[2].tick_params(axis='x', rotation=0)
# Plot 4: Sub-Agency vs Violation Type
sub_agency_violation_counts = traf_df.groupby(['SubAgency', 'Violation Type']).size().unstack(fill_value=0)
sub_agency_violation_proportions = sub_agency_violation_counts.div(sub_agency_violation_counts.sum(axis=1), axis=0)
sub_agency_violation_counts.plot(kind='bar', stacked=True, ax=axes[3], color=['gold', 'teal'])
axes[3].set_title('Violation Type Counts by Sub-Agency')
axes[3].set_xlabel('Sub-Agency')
axes[3].set_ylabel('Count')
axes[3].legend(title='Violation Type')
axes[3].tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()
# figure and axes for the 2x2 grid of subplots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
# Flatten the axes array for easy iteration
axes = axes.flatten()
# Plot 1: Gender vs Violation Type
gender_violation_counts = traf_df.groupby(['Gender', 'Violation Type']).size().unstack(fill_value=0)
gender_violation_proportions = gender_violation_counts.div(gender_violation_counts.sum(axis=1), axis=0)
print(gender_violation_proportions)
gender_violation_proportions.plot(kind='bar', stacked=True, ax=axes[0], color=['skyblue', 'lightgreen'])
axes[0].set_title('Violation Type Proportions by Gender')
axes[0].set_xlabel('Gender')
axes[0].set_ylabel('Proportion')
axes[0].legend(title='Violation Type')
axes[0].tick_params(axis='x', rotation=0)
# Plot 2: Work Zone vs Violation Type
work_zone_violation_counts = traf_df.groupby(['Work Zone', 'Violation Type']).size().unstack(fill_value=0)
work_zone_violation_proportions = work_zone_violation_counts.div(work_zone_violation_counts.sum(axis=1), axis=0)
print(work_zone_violation_proportions)
work_zone_violation_proportions.plot(kind='bar', stacked=True, ax=axes[1], color=['salmon', 'orchid'])
axes[1].set_title('Violation Type Proportions by Work Zone')
axes[1].set_xlabel('Work Zone')
axes[1].set_ylabel('Proportion')
axes[1].legend(title='Violation Type')
axes[1].tick_params(axis='x', rotation=0)
# Plot 3: Alcohol Involvement vs Violation Type
alcohol_violation_counts = traf_df.groupby(['Alcohol', 'Violation Type']).size().unstack(fill_value=0)
alcohol_violation_proportions = alcohol_violation_counts.div(alcohol_violation_counts.sum(axis=1), axis=0)
print(alcohol_violation_proportions)
alcohol_violation_proportions.plot(kind='bar', stacked=True, ax=axes[2], color=['lightcoral', 'lightblue'])
axes[2].set_title('Violation Type Proportions by Alcohol Involvement')
axes[2].set_xlabel('Alcohol Involvement')
axes[2].set_ylabel('Proportion')
axes[2].legend(title='Violation Type')
axes[2].tick_params(axis='x', rotation=0)
# Plot 4: Sub-Agency vs Violation Type
sub_agency_violation_counts = traf_df.groupby(['SubAgency', 'Violation Type']).size().unstack(fill_value=0)
sub_agency_violation_proportions = sub_agency_violation_counts.div(sub_agency_violation_counts.sum(axis=1), axis=0)
print(sub_agency_violation_proportions)
sub_agency_violation_proportions.plot(kind='bar', stacked=True, ax=axes[3], color=['gold', 'teal'])
axes[3].set_title('Violation Type Proportions by Sub-Agency')
axes[3].set_xlabel('Sub-Agency')
axes[3].set_ylabel('Proportion')
axes[3].legend(title='Violation Type')
axes[3].tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()
Violation Type  Citation   Warning
Gender
F               0.431821  0.568179
M               0.510515  0.489485
U               0.152019  0.847981
Violation Type  Citation   Warning
Work Zone
0               0.483972  0.516028
1               0.656522  0.343478
Violation Type  Citation   Warning
Alcohol
0               0.483247  0.516753
1               0.940828  0.059172
Violation Type                                    Citation   Warning
SubAgency
1st district, Rockville                           0.424152  0.575848
2nd district, Bethesda                            0.541992  0.458008
3rd district, Silver Spring                       0.537274  0.462726
4th district, Wheaton                             0.478258  0.521742
5th district, Germantown                          0.446912  0.553088
6th district, Gaithersburg / Montgomery Village   0.419003  0.580997
Headquarters and Special Operations               0.543749  0.456251
Summary of Violation Type Proportions by Various Factors
- Violation Type Proportions by Gender
Female (F) - Approximately 57% warnings and 43% citations.
Male (M) - Around 49% warnings and 51% citations, so men receive citations slightly more often than women.
Undefined (U) - A significantly higher proportion of warnings (85%) compared to citations.
Females receive warnings more frequently than males, and undefined-gender cases show the highest proportion of warnings.
- Violation Type Proportions by Work Zone
Non-Work Zone (0) - Roughly 48% citations and 52% warnings.
Work Zone (1) - A notably higher share of citations, around 66% citations and 34% warnings.
There is a significant difference in violation type proportions between work zones and non-work zones.
- Violation Type Proportions by Alcohol Involvement
No Alcohol Involvement (0) - About 48% citations and 52% warnings.
Alcohol Involvement (1) - A much higher proportion of citations, around 94%, compared to 6% warnings.
Drivers with alcohol involvement are far more likely to receive citations than warnings, indicating stricter enforcement for alcohol-related stops.
- Violation Type Proportions by Sub-Agency
The distribution is relatively consistent across sub-agencies, averaging roughly 48% citations and 52% warnings.
Sub-agency differences are comparatively small, suggesting broadly similar enforcement practices across districts, although the 2nd and 3rd districts and Headquarters issue slightly more citations than warnings.
# Contingency table for Race vs. Gender
contingency_gender = pd.crosstab(traf_df['Race'], traf_df['Gender'])
# Contingency table for Race vs. Work Zone
contingency_work_zone = pd.crosstab(traf_df['Race'], traf_df['Work Zone'])
# Contingency table for Race vs. Alcohol Involvement
contingency_alcohol = pd.crosstab(traf_df['Race'], traf_df['Alcohol'])
# Contingency table for Race vs. Sub-Agency
contingency_sub_agency = pd.crosstab(traf_df['Race'], traf_df['SubAgency'])
# Contingency table for Race vs. Hour of Stop
contingency_hour = pd.crosstab(traf_df['Race'], traf_df['Hour'])
chi2_gender, p_gender, dof_gender, ex_gender = chi2_contingency(contingency_gender)
print(f"Chi-squared test for Race vs. Gender: p-value = {p_gender}")
# Chi-squared test for Race vs. Work Zone
chi2_work_zone, p_work_zone, dof_work_zone, ex_work_zone = chi2_contingency(contingency_work_zone)
print(f"Chi-squared test for Race vs. Work Zone: p-value = {p_work_zone}")
# Chi-squared test for Race vs. Alcohol Involvement
chi2_alcohol, p_alcohol, dof_alcohol, ex_alcohol = chi2_contingency(contingency_alcohol)
print(f"Chi-squared test for Race vs. Alcohol Involvement: p-value = {p_alcohol}")
# Chi-squared test for Race vs. Sub-Agency
chi2_sub_agency, p_sub_agency, dof_sub_agency, ex_sub_agency = chi2_contingency(contingency_sub_agency)
print(f"Chi-squared test for Race vs. Sub-Agency: p-value = {p_sub_agency}")
# Chi-squared test for Race vs. Hour of Stop
chi2_hour, p_hour, dof_hour, ex_hour = chi2_contingency(contingency_hour)
print(f"Chi-squared test for Race vs. Hour of Stop: p-value = {p_hour}")
Chi-squared test for Race vs. Gender: p-value = 0.0
Chi-squared test for Race vs. Work Zone: p-value = 2.463036280003782e-07
Chi-squared test for Race vs. Alcohol Involvement: p-value = 3.611948664708815e-47
Chi-squared test for Race vs. Sub-Agency: p-value = 0.0
Chi-squared test for Race vs. Hour of Stop: p-value = 0.0
To correlate the different factors (like gender, work zone, alcohol involvement, and sub-agency) with race and understand their impact on the type of violation, we can perform a more comprehensive statistical analysis. We'll use chi-squared tests to determine if there are significant relationships between categorical variables. We chose the chi-squared test because it is a powerful statistical method for determining whether there is a significant association between two categorical variables.
Hypotheses
When we hypothesize the relationship between race and each of these factors, we set up the following hypotheses
Null Hypothesis ($H_0$)- There is no association between the two categorical variables, meaning they are independent of each other. For instance, race and gender are independent.
Alternative Hypothesis ($H_a$)- There is an association between the two categorical variables, indicating that they are not independent. For example, race and gender are not independent.
We used a significance level of 0.05 (5%) for our tests. If the p-value obtained from the chi-squared test is less than 0.05, we reject the null hypothesis, indicating a significant association between the variables.
Race vs. Gender
p-value = 0.0
There is a significant association between race and gender in traffic violations. This suggests that the distribution of gender within different racial groups is not uniform and may influence the type of violation received.
Race vs. Work Zone
p-value = 2.463036280003782e-07
There is a significant association between race and work zone status. This indicates that different racial groups may have different likelihoods of being stopped in work zones, potentially affecting the type of violation issued.
Race vs. Alcohol Involvement
p-value = 3.611948664708815e-47
There is a significant association between race and alcohol involvement in traffic violations. This suggests that the presence of alcohol in traffic stops varies significantly across racial groups, which could influence enforcement practices.
Race vs. Sub-Agency
p-value = 0.0
There is a significant association between race and sub-agency. This indicates that different racial groups are not uniformly distributed across sub-agencies, which could affect the type and frequency of violations issued.
Race vs. Hour of Stop
p-value = 0.0
There is a significant association between race and the hour of the stop. This suggests that the time of day when traffic stops occur varies significantly across racial groups, potentially influencing the type of violation received.
Finally, we turn to our primary hypothesis about the relationship between race and violation type. To test the relationship between race and the type of violation (Citation vs. Warning), we set up the following hypotheses:
Null Hypothesis ($H_0$)
There is no significant association between race and the type of violation. In other words, the likelihood of receiving a citation versus a warning is independent of the driver's race.
Alternative Hypothesis ($H_a$)
There is a significant association between race and the type of violation. In other words, the likelihood of receiving a citation versus a warning depends on the driver's race.
contingency_table = pd.crosstab(traf_df['Race'], traf_df['Violation Type'])
print(contingency_table)
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"p-value: {p}")
Violation Type   Citation  Warning
Race
ASIAN               26838    38876
BLACK              175876   176953
HISPANIC           129647   101815
NATIVE AMERICAN      1165     1450
OTHER               25700    35042
WHITE              182375   223256
p-value: 0.0
Our chi-squared test revealed a p-value of essentially 0, indicating a highly significant association between race and the type of violation issued (Citation vs. Warning). This result suggests that the likelihood of receiving a citation versus a warning is not independent of the driver's race. These findings support our primary hypothesis that race is associated with the type of violation received.
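With over a million stops, even small differences produce p-values near zero, so it can be useful to supplement the test with an effect-size measure. A hedged sketch using Cramér's V (not part of the original analysis) on the contingency table above:
# Hedged supplement: Cramer's V effect size for Race vs. Violation Type
# (0 = no association, 1 = perfect association). Large n can make p ~ 0 even for modest effects.
n_obs = contingency_table.values.sum()
min_dim = min(contingency_table.shape) - 1
cramers_v = np.sqrt(chi2 / (n_obs * min_dim))
print(f"Cramer's V: {cramers_v:.3f}")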
Primary Analysis
In this section, we implement a machine learning model to predict the type of violation (Citation vs. Warning) based on various features. Given the nature of our problem, we chose a classification model for this task. We will use a Random Forest Classifier, a robust and versatile ensemble learning method, to perform the classification.
To do this, we need to pick features that are correlated with violation type. To visualize this, we show a proportional stacked bar graph for each candidate feature; these comparisons quickly indicate whether splitting on a feature can produce low-entropy (more homogeneous) groups.
# Determine the features and labels
X = traf_df[['Alcohol',"Charge","Personal Injury", "Property Damage", "Contributed To Accident"]]
y = traf_df["Violation Type"].map({'Citation': 1, 'Warning': 0})
important_columns = ['Alcohol',"Personal Injury", "Property Damage", "Contributed To Accident"]
fig, axes = plt.subplots(len(important_columns),1, figsize=(14, 28))
for i, column_name in enumerate(important_columns):
violation_counts = traf_df.groupby([column_name, 'Violation Type']).size().unstack(fill_value=0)
violation_proportions = violation_counts.div(violation_counts.sum(axis=1), axis=0)
violation_proportions.plot(kind='bar', stacked=True, ax=axes[i], color=['gold', 'teal'])
axes[i].set_title('Violation Type vs ' + f"{column_name}")
axes[i].set_xlabel(column_name)
axes[i].set_ylabel('Proportion')
axes[i].legend(title='Violation Type')
axes[i].tick_params(axis='x', rotation=90)
plt.show()
Some data cleaning had to occur to reduce the several hundred raw charge codes in the "Charge" column to a manageable set of numeric values. We strip the sub-section suffixes, keep only the most frequent codes (roughly the top 50), and collapse all very low-frequency codes into a single placeholder value (the mean of the retained codes).
traf_df = traf_df.copy()
traf_df["Charge"] = traf_df["Charge"].map(lambda x: re.sub(r'\(.*?\)', '', x))
traf_df["Charge"] = traf_df["Charge"].map(lambda x: re.sub(r'\..*', '', x))
traf_df["Charge"] = traf_df["Charge"].map(lambda x: re.sub(r'-', '', x))
X = X.copy()
X["Charge"] = traf_df["Charge"]
n = 50
top_n_results = X["Charge"].value_counts().sort_values(ascending=False)[:n-1]
mean = int(pd.Series(top_n_results.index.astype(int)).mean())
def charge_to_numerical(charge):
if charge not in top_n_results:
return mean
return charge
X["Charge"] = X["Charge"].apply(charge_to_numerical)
Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
After dividing the data, we standardize the features so that the machine learning model (Random Forest) is not influenced by the different scales of the features. (In practice, tree-based models are largely insensitive to feature scaling, but standardization does no harm here.)
# Normalize or standardize the feature sets (both X_train and X_test)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_mean = scaler.mean_
X_train_normalized = scaler.transform(X_train)
scaler.fit(X_test)
X_test_mean = scaler.mean_
X_test_normalized = scaler.transform(X_test)
# Print Results
print("Train mean:",X_train_mean,"\n")
print("X_train_normalized", X_train_normalized,"\n\n")
print("Test mean:",X_test_mean,"\n")
print("X_test_normalized", X_test_normalized)
Train mean: [1.67752890e-03 2.75316148e+04 1.17516389e-02 1.86545299e-02 2.37969092e-02] X_train_normalized [[-0.04099204 -0.16091379 9.17030609 -0.13787362 6.40486252] [-0.04099204 -0.37086592 -0.10904761 -0.13787362 6.40486252] [-0.04099204 -0.3708134 -0.10904761 -0.13787362 -0.15613138] ... [-0.04099204 -0.30017285 -0.10904761 -0.13787362 -0.15613138] [-0.04099204 4.82121479 -0.10904761 -0.13787362 -0.15613138] [-0.04099204 0.09567686 -0.10904761 -0.13787362 -0.15613138]] Test mean: [1.62348301e-03 2.73856279e+04 1.17367396e-02 1.88979380e-02 2.36611478e-02] X_test_normalized [[-0.04032522 -0.37020743 -0.10897764 -0.13878742 -0.15567455] [-0.04032522 -0.37020743 -0.10897764 -0.13878742 -0.15567455] [-0.04032522 -0.37020743 -0.10897764 -0.13878742 -0.15567455] ... [-0.04032522 -0.14516509 -0.10897764 -0.13878742 -0.15567455] [-0.04032522 -0.37020743 -0.10897764 -0.13878742 -0.15567455] [-0.04032522 -0.13717041 -0.10897764 -0.13878742 -0.15567455]]
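One caveat about the cell above: fitting a separate scaler on the test set lets test-set statistics leak into preprocessing. The more standard pattern, sketched below, fits the scaler on the training data only and reuses it for the test data; for a Random Forest the practical difference is minor, since trees are largely insensitive to feature scaling.
# Leakage-free alternative: fit the scaler on the training set only, then reuse it
scaler = StandardScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)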
model = RandomForestClassifier()
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train_normalized, y_train, cv=skf, scoring='accuracy')
print(f"Randomforest, Cross-validation accuracy: {scores.mean():.4f} Β± {scores.std():.4f}\n")
Random Forest, Cross-validation accuracy: 0.7004 ± 0.0009
In this cell, we perform Stratified K-Fold cross-validation with 5 folds to evaluate the Random Forest classifier. This technique ensures each fold has a similar distribution of the target variable. We print the mean and standard deviation of the accuracy scores, giving us an estimate of the model's performance and consistency across different subsets of the data. As the output above shows, the model achieves a cross-validation accuracy of roughly 70%, with room for improvement by exploring additional features and refining the model further. We consider this reasonable because, as the first graph in the exploration section shows, warnings and citations are split roughly 50/50, so our results are clearly better than a coin toss. The findings from this analysis can help inform traffic enforcement policies and promote fair and effective traffic management practices.
model.fit(X_train_normalized, y_train)
y_pred = model.predict(X_test_normalized)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Randomforest, Accuaray: {accuracy:.4f}\nReport:")
print(report,"\n\n")
Random Forest, Accuracy: 0.6440
Report:
              precision    recall  f1-score   support

           0       0.63      0.76      0.69    173073
           1       0.67      0.52      0.59    162625

    accuracy                           0.64    335698
   macro avg       0.65      0.64      0.64    335698
weighted avg       0.65      0.64      0.64    335698
In this part, we train the Random Forest classifier on the entire normalized training set and then evaluate its performance on the normalized test set. We print the model's accuracy on the test set (about 64%) and provide a detailed classification report, which includes precision, recall, and F1-score for each class (Citation and Warning).
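Since part of the goal is identifying which factors matter most, the fitted forest's impurity-based feature importances can also be inspected directly; a minimal sketch using the feature names from X:
# Rank the five features by the fitted forest's impurity-based importance
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances)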
Visualization
# Select the columns
traf_df['Violation Type'] = traf_df['Violation Type'].map({'Citation': 1, 'Warning': 0})
important_columns = ['Alcohol',"Personal Injury", "Property Damage", "Contributed To Accident", "Violation Type"]
importantcolmn = traf_df[important_columns]
# Calculate the correlation matrix
correlation_matrix = importantcolmn.corr()
# Plot the heatmap
plt.figure(figsize=(11, 8))
ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
# Set x-axis labels to be at the top
ax.xaxis.set_ticks_position('top')
ax.xaxis.set_label_position('top')
plt.xticks(rotation=45, ha='left')
plt.yticks(rotation=0)
plt.title('Correlation Matrix', pad=20)
plt.show()
From this visualization we learn that Alcohol has the weakest correlation with violation type (about 0.04), while Property Damage and Contributed To Accident have the strongest correlations with violation type.
Insight and Conclusion
According to our analysis, there are significant racial disparities in traffic violation outcomes: White drivers are more likely to receive warnings and Hispanic drivers more likely to receive citations, while Black drivers had nearly equal proportions of both. Temporal patterns showed peaks in citations around 11 PM and warnings around 10 PM, highlighting critical enforcement periods. Alcohol involvement and work zones significantly increased the likelihood of citations, indicating stricter enforcement in these contexts. The Random Forest model achieved roughly 70% cross-validation accuracy (about 64% on the held-out test set) in predicting violation types, suggesting that these features carry real, if limited, signal about the factors influencing traffic violations.
This comprehensive analysis of Maryland traffic violations provides valuable insights into the patterns and trends of traffic-related incidents. By examining demographic factors, temporal patterns, and specific circumstances of violations, we identified key areas where enforcement practices might be influenced by biases or other factors. For policymakers and law enforcement agencies, these findings can inform data-driven strategies to enhance traffic management and public safety. For instance, targeted interventions can be developed to address the identified disparities in traffic violation outcomes, ensuring fair and equitable enforcement practices.
Uninformed Readers: This project provides a clear and detailed introduction to the topic of traffic violations, presenting the methodology and findings in an accessible manner. Uninformed readers will gain a solid understanding of the key factors influencing traffic violations and the potential disparities in enforcement practices.
Informed Readers: Those already familiar with traffic enforcement and public safety topics will find the detailed statistical analysis and machine learning model insights valuable. The detailed findings, such as the specific demographic disparities and the temporal patterns of violations, offer a deeper understanding and may highlight areas for further investigation or policy development.