Accident Analytics: A Deep Dive into US Traffic Data
In this project, we will focus on explanatory data visualization and practice the following:
- Analyze Traffic Accident Severity: Explore accident severity across different states and times to identify which conditions lead to more serious incidents.
- Identify Time-Based Patterns: Investigate temporal trends in accidents, such as peak accident hours, days, or seasonal variations across years.
- Geographical Analysis of Accident Hotspots: Determine accident hotspots in various regions of the US and correlate them with environmental and road conditions.
- Understand the Impact of Weather and Road Conditions: Examine how weather, road surface conditions, and visibility impact accident occurrence and severity.
- Discover Correlations Between Traffic Accidents and External Factors: Analyze correlations between accidents and external factors such as traffic density, speed limits, or road types.
- Predictive Analysis (Optional): Develop models to predict the likelihood of accidents based on various features like time, location, weather, and road conditions.
Introducing the Dataset
This is a countrywide car accident dataset that covers 49 states of the USA. The accident data were collected from February 2016 to March 2023, using multiple APIs that provide streaming traffic incident (or event) data. These APIs broadcast traffic data captured by various entities, including the US and state departments of transportation, law enforcement agencies, traffic cameras, and traffic sensors within the road networks. The dataset currently contains approximately 7.7 million accident records.
Dataset Overview
The dataset used in this project is US_Accidents_March2023.csv, containing detailed information about U.S. traffic accidents. Below is a summary of the key features:
| Feature | Description |
|---|---|
| ID | Unique identifier for each accident |
| Start_Time | The start time of the accident |
| End_Time | The end time of the accident |
| Severity | Accident severity level (1 = Minor, 4 = Fatal) |
| State | The U.S. state where the accident occurred |
| City | The city where the accident occurred |
| Weather_Condition | Weather conditions during the accident |
| Visibility | Visibility at the time of the accident (in miles) |
| Temperature | Temperature at the time of the accident (in Fahrenheit) |
| Start_Lat | Latitude of the start point (GPS coordinates) |
| Start_Lng | Longitude of the start point (GPS coordinates) |
This dataset offers opportunities to explore traffic accident trends and the impact of factors like weather and road conditions.
Import Libraries and Load the Data
The first step is loading the libraries I will need to load, explore, and visualize the data. I will be using the following:
- Dask
- NumPy
- Pandas
- Matplotlib
- Seaborn
- Folium
- scikit-learn
- gc (garbage collection)
# Data processing and manipulation
import dask.dataframe as dd
import pandas as pd
import numpy as np
import re
# Visualization libraries
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
plt.rcParams['axes.grid'] = False
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
# Machine learning and preprocessing
from sklearn.preprocessing import LabelEncoder
# Memory management
import gc
First, I'll load the dataset using Dask for efficient memory handling, check the size, and preview the first few rows to get a feel for the data. This sets the stage for deeper analysis of patterns like accident severity and time-based trends.
# Load the dataset as a Dask DataFrame
df = dd.read_parquet('/Users/er/Desktop/Data Analysis/Projects/Python/US Accidents/USTrafficAccidents/Data/Parquet/US_Accidents_March23.parquet')
# Compute dataset dimensions
num_rows, num_cols = df.shape[0].compute(), df.shape[1]
# Print dataset overview
print(f"Number of features (columns): {num_cols}")
print(f"Total accidents recorded (rows): {num_rows}")
# Display the first 5 rows of the DataFrame
df.head(5)
Number of features (columns): 46
Total accidents recorded (rows): 7728394
| | ID | Source | Severity | Start_Time | End_Time | Start_Lat | Start_Lng | End_Lat | End_Lng | Distance(mi) | ... | Roundabout | Station | Stop | Traffic_Calming | Traffic_Signal | Turning_Loop | Sunrise_Sunset | Civil_Twilight | Nautical_Twilight | Astronomical_Twilight |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | A-1 | Source2 | 3 | 2016-02-08 05:46:00 | 2016-02-08 11:00:00 | 39.865147 | -84.058723 | NaN | NaN | 0.01 | ... | False | False | False | False | False | False | Night | Night | Night | Night |
| 1 | A-2 | Source2 | 2 | 2016-02-08 06:07:59 | 2016-02-08 06:37:59 | 39.928059 | -82.831184 | NaN | NaN | 0.01 | ... | False | False | False | False | False | False | Night | Night | Night | Day |
| 2 | A-3 | Source2 | 2 | 2016-02-08 06:49:27 | 2016-02-08 07:19:27 | 39.063148 | -84.032608 | NaN | NaN | 0.01 | ... | False | False | False | False | True | False | Night | Night | Day | Day |
| 3 | A-4 | Source2 | 3 | 2016-02-08 07:23:34 | 2016-02-08 07:53:34 | 39.747753 | -84.205582 | NaN | NaN | 0.01 | ... | False | False | False | False | False | False | Night | Day | Day | Day |
| 4 | A-5 | Source2 | 2 | 2016-02-08 07:39:07 | 2016-02-08 08:09:07 | 39.627781 | -84.188354 | NaN | NaN | 0.01 | ... | False | False | False | False | True | False | Day | Day | Day | Day |
5 rows × 46 columns
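A note on formats: the overview above describes US_Accidents_March2023.csv, while the code reads a Parquet copy of the same data. If you only have the raw CSV, a one-time conversion along the lines of the sketch below produces that Parquet file (the file paths here are placeholders, not the project's actual locations):

# One-time conversion of the raw CSV to Parquet (paths are placeholders)
# assume_missing=True lets Dask treat integer columns that contain gaps as floats
raw = dd.read_csv('US_Accidents_March2023.csv', assume_missing=True, blocksize='64MB')
raw.to_parquet('US_Accidents_March23.parquet', write_index=False)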
General Trends and Exploratory Data Analysis
This section delves into understanding the underlying patterns and trends within the dataset. Exploratory Data Analysis (EDA) serves as a crucial step in the data analysis process, allowing us to uncover insights that might not be immediately evident. We aim to analyze:
Overall Accident Volume: Assess the total number of accidents within the dataset's timeframe to provide a foundational understanding of the data's scale.
# Load the dataset as a Dask DataFrame
df = dd.read_parquet('/Users/er/Desktop/Data Analysis/Projects/Python/US Accidents/USTrafficAccidents/Data/Parquet/US_Accidents_March23.parquet')
# Compute the shape of the DataFrame
num_rows = df.shape[0].compute() # Total number of rows (accidents)
print(f"There are a total of {num_rows} accidents in the dataset's time range.")
There are a total of 7728394 accidents in the dataset's time range.
Temporal Trends: Investigate trends over the years to determine whether accident rates are increasing or decreasing. This insight can be essential for identifying potential safety improvements or areas requiring more attention.
What is the trend in accidents over the years? Are accidents increasing or decreasing over time?
# Define a list of years for analysis
years = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
# Initialize total accident count and dictionary for yearly counts
total_count = 0
yearly_counts = {}
# Count occurrences of accidents per year
for year in years:
    count = df['Start_Time'].str.contains(str(year)).sum().compute()
    yearly_counts[year] = count
    total_count += count
    #print(f"Accidents in {year}: {count}")
# Print total accidents from 2016 to 2023
#print(f"Total accidents from 2016 to 2023: {total_count}")
# Plotting the results
plt.figure(figsize=(10, 6))
plt.bar(yearly_counts.keys(), yearly_counts.values(), color='lightsteelblue')
# Format the Y-axis for readability
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
# Label axes and title
plt.xlabel('Year')
plt.ylabel('Number of Accidents')
plt.title('Accidents per Year (2016-2023)')
plt.xticks(years)
plt.tight_layout() # Adjust layout
plt.show()
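As an aside, the loop above triggers one Dask computation per year. A roughly equivalent single-pass sketch (assuming 'Start_Time' is still stored as a string at this point, as in the raw file) parses the timestamps once and counts accidents by year in one computation:

# Parse timestamps once, then count accidents per year in a single pass
start_times = dd.to_datetime(df['Start_Time'], format='mixed', errors='coerce')
yearly_counts_alt = start_times.dt.year.value_counts().compute().sort_index()
print(yearly_counts_alt)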
Accident Data Summary and Analysis
From 2016 to 2022, there has been a noticeable and concerning upward trend in the number of traffic accidents each year. In 2016, there were 410,821 recorded accidents, and by 2022, that number had skyrocketed to 1,762,452. Here's the yearly breakdown:
- 2016: 410,821 accidents
- 2017: 718,093 accidents
- 2018: 893,426 accidents
- 2019: 954,303 accidents
- 2020: 1,178,913 accidents
- 2021: 1,563,753 accidents
- 2022: 1,762,452 accidents
- 2023: 246,633 accidents (so far)
In total, from 2016 to 2023, there have been over 7.7 million accidents. This consistent rise in accidents highlights growing road safety concerns over the years. Although 2023's data is incomplete, early numbers suggest that this upward trend may continue. To better understand and anticipate future trends, we plan to implement a forecasting model to project accident rates for the remainder of 2023 and beyond.
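As a first step toward that forecast, the sketch below fits a simple linear trend to the complete years (2016-2022) and extrapolates it to 2023. It is only a baseline for comparison, not the forecasting model itself, and it ignores seasonality and the partial 2023 data.

# Fit a linear trend to the complete years (2016-2022) and extrapolate to 2023
complete_years = np.array([2016, 2017, 2018, 2019, 2020, 2021, 2022])
complete_counts = np.array([410821, 718093, 893426, 954303, 1178913, 1563753, 1762452])
slope, intercept = np.polyfit(complete_years, complete_counts, 1)
projection_2023 = slope * 2023 + intercept
print(f"Linear-trend projection for 2023: {projection_2023:,.0f} accidents")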
Seasonal Patterns: Analyze the distribution of accidents across months and days of the week. Identifying seasonal trends can help in understanding when accidents are most likely to occur, aiding in resource allocation for traffic management and safety initiatives.
What is the distribution of accidents across different months? Are there seasonal trends?
# List of years
years = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
# Initialize dictionary to hold total counts for each month
monthly_totals = {f"{str(month).zfill(2)}": 0 for month in range(1, 13)}
# For loop to count occurrences for each month across all years
for year in years:
    for month in range(1, 13):
        year_month = f"{year}-{str(month).zfill(2)}"  # Format as 'YYYY-MM'
        # Using Dask to compute the count
        count = df['Start_Time'].str.contains(year_month).sum().compute()
        # Accumulate the count to the corresponding month
        monthly_totals[str(month).zfill(2)] += count
# Prepare data for plotting
months = [
'January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December'
]
values = list(monthly_totals.values())
# Plotting the total accidents per month
plt.figure(figsize=(12, 6))
plt.bar(months, values, color='lightblue')
plt.title('Cumulative Monthly Accident Trends (2016-2023)')
plt.xlabel('Months')
plt.ylabel('Number of Accidents')
plt.xticks(rotation=45, ha='right')
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}')) # Format Y-axis
plt.tight_layout()
plt.show()
Analysis of Seasonal Trends
U.S. traffic accidents follow a clear seasonal trend, with winter months (December and January) being the most hazardous, likely due to a combination of weather and holiday travel. In contrast, the summer months, particularly July, experience fewer accidents. This trend is crucial for public awareness, policy-making, and resource allocation for traffic safety initiatives.
Now, we are going to explore seasonal trends in U.S. traffic accidents from 2016 to 2023. These visualizations break down the number of accidents by month, allowing us to spot any patterns or fluctuations across different years and seasons.
# List of years
years = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
months = [
'January',
'February',
'March',
'April',
'May',
'June',
'July',
'August',
'September',
'October',
'November',
'December'
]
# Initialize dictionary to hold values for each month across the years
monthly_trends = {f"{str(month).zfill(2)}": [] for month in range(1, 13)}
# For loop to count occurrences for each month across all years
for year in years:
    for month in range(1, 13):
        year_month = f"{year}-{str(month).zfill(2)}"  # Format as 'YYYY-MM'
        count = df['Start_Time'].str.contains(year_month).sum().compute()
        # Append the count to the corresponding month list
        monthly_trends[str(month).zfill(2)].append(count)
# Create a 3x4 grid for the subplots (3 rows and 4 columns)
fig, axs = plt.subplots(3, 4, figsize=(20, 15))
axs = axs.flatten() # Flatten the 2D array of axes for easy indexing
# Loop through each month in monthly_trends to generate bar charts
for i, month in enumerate(monthly_trends):
    axs[i].bar(years, monthly_trends[month], color='skyblue')
    # Get the month name from the months list based on the month number
    month_index = int(month) - 1  # Convert to 0-based index
    axs[i].set_title(f"{months[month_index]} Trends in Traffic Accidents (2016-2023)")
    axs[i].set_xlabel("Year")
    axs[i].set_ylabel("Number of Accidents")
    axs[i].set_xticks(years)  # Ensure all years are labeled on the x-axis
# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()
The trend shows a general increase in accidents from 2016 to 2022, with a significant dip in 2020 likely due to COVID-19 lockdowns. Accidents rebounded sharply in 2021, and winter months (November, December, January) consistently show the most accidents. Notably, 2023 sees a decline in accidents for the early months (January to March) compared to previous years, possibly indicating improved safety measures or other factors. Summer months (July-September) are relatively stable with moderate growth.
Time of Day Analysis: Explore at what times most accidents happen (morning, afternoon, night). This information can be invaluable for informing traffic safety campaigns and planning.
Which days of the week are most accidents likely to occur?
# Ensure 'Start_Time' is in datetime format, using mixed format to handle variations
df['Start_Time'] = dd.to_datetime(df['Start_Time'], format='mixed', errors='coerce')
# Extract only the date (removes the time)
df['date_only'] = df['Start_Time'].dt.date
# Extract the day of the week (e.g., 'Monday', 'Tuesday') and store in 'weekday' column
df['weekday'] = df['Start_Time'].dt.day_name()
# Create a new DataFrame with only the 'weekday' column
weekday_df = df[['weekday']]
# Count how many times each day appears and sort by counts in descending order
weekday_counts = weekday_df['weekday'].value_counts().compute().sort_values(ascending=False)
# Print the counts for each day
#print("Counts of each weekday:")
#print(weekday_counts)
# Plotting the counts
plt.figure(figsize=(10, 6))
weekday_counts.plot(kind='bar', color='skyblue')
plt.title('Count of Accidents by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Count of Accidents')
plt.xticks(rotation=45)
# Set the y-axis limit to start from 0 and end at the maximum count
plt.ylim(0, weekday_counts.max() * 1.1) # Optional padding above the max count
# Format the Y-axis for readability
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{int(x):,}'))
plt.tight_layout() # Adjust layout
plt.show()
Fridays lead with the highest number of accidents, totaling 1.37 million. Thursday, Wednesday, and Tuesday follow closely, each with around 1.3 million accidents, making the mid-to-late workweek the most accident-prone. Mondays also see a high number of accidents, with 1.21 million, reflecting a busy start to the week.
In contrast, weekends have fewer accidents, especially Sunday, with just 562,744 accidents, likely due to lighter traffic and fewer commuters.
Summary:
- Weekdays, particularly Friday, experience the most accidents.
- Weekends, especially Sunday, are the safest, likely because of lower traffic volumes and fewer commuters.
At what time of day do most accidents happen (morning, afternoon, or night)?
# Ensure 'Start_Time' is in datetime format
df['Start_Time'] = dd.to_datetime(df['Start_Time'], format='mixed', errors='coerce')
# Extract the hour from 'Start_Time'
df['hour'] = df['Start_Time'].dt.hour
# Define a function to categorize the time of day
def categorize_time_of_day(hour):
    if 0 <= hour < 6:
        return 'Night'
    elif 6 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'  # For hours 21 through 23
# Apply the categorization function element-wise with Dask's Series.map
df['time_of_day'] = df['hour'].map(categorize_time_of_day, meta=('x', 'object'))
# Count how many times each time of day appears and sort by counts in descending order
time_of_day_counts = df['time_of_day'].value_counts().compute().sort_values(ascending=False)
# Print the counts for each time of day
print("Counts of each time of day:")
print(time_of_day_counts)
# Plotting the counts
plt.figure(figsize=(10, 6))
time_of_day_counts.plot(kind='bar', color='skyblue')
plt.title('Count of Accidents by Time of Day')
plt.xlabel('Time of Day')
plt.ylabel('Count of Accidents')
plt.xticks(rotation=45)
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{int(x):,}')) # Format y-axis with commas
plt.tight_layout() # Adjust layout
plt.show()
Counts of each time of day:
time_of_day
Afternoon    2884131
Morning      2631665
Night        1260209
Evening       952389
Name: count, dtype: int64
Summary of Accident Counts by Time of Day
Overview of Accident Distribution:
- Afternoon: 2,884,131 accidents
- Morning: 2,631,665 accidents
- Night: 1,260,209 accidents
- Evening: 952,389 accidents
Key Insights:
- Afternoon Peak: The afternoon sees the highest number of accidents, likely due to increased traffic from commuters returning home.
- Morning Activity: Mornings follow closely, aligning with rush hour and higher traffic congestion.
- Nighttime Incidents: While nighttime accidents are fewer, they may involve greater severity due to reduced visibility (a quick check of this idea is sketched after the conclusion below).
- Evening Decline: Evenings experience the fewest accidents, possibly due to lower traffic volumes.
Conclusion: Most accidents occur during daytime hours, particularly in the afternoon and morning. These insights can help inform traffic management strategies, such as:
- Enhanced Enforcement: Targeting peak hours to reduce risky driving.
- Awareness Campaigns: Promoting safe driving during high-traffic periods.
- Infrastructure Improvements: Optimizing traffic signals and road design for enhanced safety.
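As noted above, nighttime accidents may skew more severe even though they are fewer. One quick way to check this, reusing the 'time_of_day' column created earlier, is to compare the share of Severity 4 incidents across the four periods:

# Share of Severity 4 (most severe) accidents within each time-of-day bucket
sev4_counts = df[df['Severity'] == 4]['time_of_day'].value_counts().compute()
all_counts = df['time_of_day'].value_counts().compute()
sev4_share = (sev4_counts / all_counts * 100).sort_values(ascending=False)
print(sev4_share.round(2))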
Geographic Distribution of Accidents
Understanding the geographic distribution of accidents is key to identifying high-risk areas and tailoring interventions effectively. In this section, we will explore:
State-Level Analysis: Identify which states have the highest and lowest accident counts, providing insights into regional differences in traffic safety.
Which states had the highest and lowest number of accidents?
# Check if 'State' exists in the DataFrame
if 'State' not in df.columns:
    raise ValueError("The column 'State' does not exist in the DataFrame.")
# Get unique state names and check for duplicates
unique_states = df['State'].unique().compute()
has_duplicates = len(unique_states) != len(set(unique_states))
# Count accidents for each state using groupby and sort in descending order
state_counts = df['State'].value_counts().compute().sort_values(ascending=False)
# Get the top 5 and least 5 states
top_5_states = state_counts.head(5)
least_5_states = state_counts.tail(5)
# Print the total number of accidents
# total_accidents = state_counts.sum()
# print(f"Total number of accidents: {total_accidents}")
# Print the top 5 and least 5 states
# print("Top 5 States with the Most Accidents:")
# for state, count in top_5_states.items():
# print(f"{state}: {count}")
# print("\nBottom 5 States with the Least Accidents:")
# for state, count in least_5_states.items():
# print(f"{state}: {count}")
# Plotting the results for all states
plt.figure(figsize=(12, 8))
plt.bar(state_counts.index, state_counts.values, color='lightsteelblue')
plt.title('Accidents per State', fontsize=14)
plt.xlabel('States', fontsize=12)
plt.ylabel('Number of Accidents', fontsize=12)
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Plotting for Top 5 States
plt.figure(figsize=(10, 6))
plt.bar(top_5_states.index, top_5_states.values, color='lightsteelblue')
plt.title('Top 5 States with Most Accidents', fontsize=14)
plt.xlabel('States', fontsize=12)
plt.ylabel('Number of Accidents', fontsize=12)
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Plotting for Least 5 States
plt.figure(figsize=(10, 6))
plt.bar(least_5_states.index, least_5_states.values, color='salmon')
plt.title('States with Least Accidents', fontsize=14)
plt.xlabel('States', fontsize=12)
plt.ylabel('Number of Accidents', fontsize=12)
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Analysis of Accident Data by State
States with the Most Incidents:
- California (CA): 1,741,433 incidents
- Florida (FL): 880,192 incidents
- Texas (TX): 582,837 incidents
- South Carolina (SC): 382,557 incidents
- New York (NY): 347,960 incidents
States with the Least Incidents:
- Wyoming (WY): 3,757 incidents
- North Dakota (ND): 3,487 incidents
- Maine (ME): 2,698 incidents
- Vermont (VT): 926 incidents
- South Dakota (SD): 289 incidents
This analysis highlights the significant disparities in the number of accidents across different states, with California, Florida, Texas, South Carolina, and New York experiencing the highest incidents, while Wyoming, North Dakota, Maine, Vermont, and South Dakota report the fewest.
City-Level Insights: Dive deeper into urban areas to discover which cities experience the most accidents. This data can help municipalities target their traffic safety initiatives more effectively.
Which cities have the highest and lowest number of accidents?
# Count accidents for each city
city_counts = df['City'].value_counts().compute() # Get counts directly
# Sort the city counts in descending order
sorted_city_counts = city_counts.sort_values(ascending=False)
# Get the top 5 cities with the most incidents
top_5_cities = sorted_city_counts.head(5)
# Get the bottom 5 cities with the least incidents
least_5_cities = sorted_city_counts.tail(5)
# Print the results
print("The first 5 cities with the most incidents are:")
for city, count in top_5_cities.items():
    print(f"{city} ({count})")
print("\nThe cities with the least incidents are:")
for city, count in least_5_cities.items():
    print(f"{city} ({count})")
# Check for duplicate city names (fewer unique cities than rows means duplicates)
has_duplicates = df['City'].nunique().compute() < len(df['City'])
# Print the result
print("\nHas duplicates:", has_duplicates)
# Plotting the top 5 cities
plt.figure(figsize=(12, 6))
plt.bar(top_5_cities.index, top_5_cities.values, color='lightsteelblue')
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.xlabel('Cities')
plt.ylabel('Number of Accidents')
plt.title('Top 5 Cities with Most Accidents')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Plotting the least 5 cities
plt.figure(figsize=(12, 6))
plt.bar(least_5_cities.index, least_5_cities.values, color='lightcoral')
plt.gca().yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))
plt.xlabel('Cities')
plt.ylabel('Number of Accidents')
plt.title('Least 5 Cities with Fewest Accidents')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
The first 5 cities with the most incidents are:
Miami (186917)
Houston (169609)
Los Angeles (156491)
Charlotte (138652)
Dallas (130939)

The cities with the least incidents are:
Willow City (1)
Window Rock (1)
Wingina (1)
Yeso (1)
Young (1)

Has duplicates: True
Which regions or states have the highest accident severity, and what factors contribute to more serious accidents in these areas?
*Note: This question focuses on states with the highest accident severity (level 4 or fatal) and differs from the analysis of states with the most accidents overall, regardless of severity.*
# Dictionary to map state abbreviations to full names
state_names = {
'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia',
'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts',
'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana',
'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico',
'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming'
}
# Group by 'State' and 'Severity' and count the occurrences of each
state_severity_counts = df.groupby(['State', 'Severity']).size()
# Compute the result
result = state_severity_counts.compute()
# Convert to a Pandas DataFrame for easier manipulation
result_df = result.reset_index(name='Count')
# Filter the DataFrame for Severity 4 (Critical Incidents)
severity_4_df = result_df[result_df['Severity'] == 4]
# Sort by 'Count' and get the top 5 states
top_5_states_severity_4 = severity_4_df.sort_values(by='Count', ascending=False).head(5)
# Assign different colors to each state using the updated method
colors = plt.colormaps.get_cmap('Set1')(np.linspace(0, 1, len(top_5_states_severity_4)))
# Plot the top 5 states with different colors
plt.figure(figsize=(10, 6))
bars = plt.bar(top_5_states_severity_4['State'], top_5_states_severity_4['Count'], color=colors)
# Set plot title and labels
plt.title('States with the Highest Accident Severity Category 4 (Critical Incident)')
plt.xlabel('State')
plt.ylabel('Count of Critical Incidents (Severity 4)')
plt.xticks(rotation=45)
# Create the legend based on the top 5 states
state_legend = {abbr: state_names[abbr] for abbr in top_5_states_severity_4['State']}
# Create colored legend
for i, bar in enumerate(bars):
    plt.text(1.05, 0.9 - (i * 0.1),
             f"{top_5_states_severity_4['State'].iloc[i]}: {state_legend[top_5_states_severity_4['State'].iloc[i]]}",
             transform=plt.gca().transAxes, fontsize=10, verticalalignment='center', color=bar.get_facecolor())
# Add space for the legend
plt.tight_layout()
# Show plot
plt.show()
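The chart above ranks states by raw Severity 4 counts, which largely mirrors overall accident volume. A follow-up sketch (reusing result_df from the cell above) normalizes by each state's total accidents to surface states where a larger share of incidents is critical:

# Share of each state's accidents that are Severity 4 (critical)
state_totals = result_df.groupby('State')['Count'].sum()
sev4_by_state = severity_4_df.set_index('State')['Count']
sev4_rate = (sev4_by_state / state_totals * 100).sort_values(ascending=False)
print(sev4_rate.head(10).round(2))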
Cluster Map Visualization: Generate a cluster map that groups areas with high and low accident concentrations across the U.S. This visual tool will highlight accident clusters, allowing for the identification of accident-prone regions. By clustering similar areas based on accident frequency, it facilitates targeted strategic planning for traffic safety measures and resource allocation.
# Load the Dask DataFrame
df = dd.read_parquet('/Users/er/Desktop/Website Esteban/errosal.github.io/Python/US_Traffic_Accidents/US_Accidents_March23.parquet')
# Filter relevant columns and rows for clustering (Severity greater than 3)
df_filtered = df[['Start_Lat', 'Start_Lng', 'Severity']]
df_filtered = df_filtered[df_filtered['Severity'] > 3] # Only severity greater than 3
# Compute the filtered DataFrame
df_computed = df_filtered.compute()
# Step 1: Count accidents by severity (for severity > 3)
severity_counts = df_computed['Severity'].value_counts()
print("Accident Count by Severity (Severity > 3):")
print(severity_counts)
# Step 2: Identify the severity level with the most accidents
max_severity = severity_counts.idxmax()
# Initialize a map centered around the average location of the accidents
avg_lat = df_computed['Start_Lat'].mean()
avg_lng = df_computed['Start_Lng'].mean()
map_cluster = folium.Map(location=[avg_lat, avg_lng], zoom_start=5)
# Initialize the marker cluster
marker_cluster = MarkerCluster().add_to(map_cluster)
# Step 3: Add points to the map and color code them
for idx, row in df_computed.iterrows():
    # Default marker color
    marker_color = 'blue'
    # Set color to red if it's the severity with the highest count
    if row['Severity'] == max_severity:
        marker_color = 'red'
    # Add marker to the map
    folium.Marker(
        location=[row['Start_Lat'], row['Start_Lng']],
        popup=f'Severity: {row["Severity"]}',
        icon=folium.Icon(color=marker_color)
    ).add_to(marker_cluster)
# Save and display the map
map_cluster.save('accident_cluster_map.html')
map_cluster
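Rendering one folium.Marker per row is slow for the roughly 200,000 points with severity above 3. A lighter-weight sketch using folium's FastMarkerCluster (same coordinates, but without per-marker popups or colors) builds the cluster map much faster:

# Faster clustering: pass the coordinates in bulk instead of adding markers one by one
from folium.plugins import FastMarkerCluster
coords = df_computed[['Start_Lat', 'Start_Lng']].dropna().values.tolist()
fast_map = folium.Map(location=[avg_lat, avg_lng], zoom_start=5)
FastMarkerCluster(coords).add_to(fast_map)
fast_map.save('accident_cluster_map_fast.html')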
Since the Cluster Map is too large to be hosted on GitHub, this video is a demonstration of it.
This section of the notebook explores the time-related patterns of accidents and their severity. By analyzing when accidents happen and how severe they are, we can identify high-risk periods and factors contributing to more serious incidents.
Time-Related Trends:
- Analyze accidents by time of day and rush hour patterns.
- Investigate yearly trends to identify whether accident rates are increasing or decreasing over time.
Accident Severity and Influencing Factors:
- Assess the distribution of accidents by severity level (minor, moderate, severe, fatal).
- Explore external factors such as weather, road conditions, and time of day that may correlate with higher accident severity.
What are the peak hours for accidents?
# Reload the dataset as a Dask DataFrame
df = dd.read_parquet('/Users/er/Desktop/Data Analysis/Projects/Python/US Accidents/USTrafficAccidents/Data/Parquet/US_Accidents_March23.parquet')
# Extract the hour from the 'Start_Time' string (characters 11-13, i.e. 'HH')
df['hour'] = df['Start_Time'].str.slice(11, 13)
# Count unique accident IDs per hour and sort by hour
hourly_unique_counts = df.groupby('hour')['ID'].nunique().compute()
hourly_unique_counts_sorted = hourly_unique_counts.sort_index(ascending=True)
print(hourly_unique_counts_sorted)
# Sanity check: the hourly counts should sum to the total number of accidents
total_unique_counts = hourly_unique_counts_sorted.sum()
print("Total unique accidents:", total_unique_counts)
plt.figure(figsize=(10, 6))
hourly_unique_counts_sorted.plot(kind='bar', color='skyblue')
# Adding titles and labels
plt.title('Accidents by Hour (Unique Counts)', fontsize=16)
plt.xlabel('Hour of the Day (00-23)', fontsize=12)
plt.ylabel('Number of Unique Accidents', fontsize=12)
# Show the plot
plt.xticks(rotation=0) # Keep x-axis labels horizontal
plt.show()
hour
00    112378
01     97071
02     93227
03     83863
04    159852
05    228182
06    405837
07    587472
08    577576
09    363034
10    342706
11    355040
12    355001
13    396445
14    448846
15    525855
16    581969
17    576015
18    432042
19    295121
20    225226
21    191452
22    167645
23    126539
Name: ID, dtype: int64
Total unique accidents: 7728394
Is there a pattern of accidents during rush hours (morning and evening)?
import seaborn as sns
import matplotlib.pyplot as plt
# Set the Seaborn style
sns.set(style='whitegrid')
# Create a bar plot with Seaborn
plt.figure(figsize=(10, 6))
sns.barplot(x=hourly_unique_counts_sorted.index, y=hourly_unique_counts_sorted.values, color='skyblue')
# Adding titles and labels
plt.title('Accidents by Hour (Unique Counts)', fontsize=16)
plt.xlabel('Hour of the Day (00-23)', fontsize=12)
plt.ylabel('Number of Unique Accidents', fontsize=12)
# Highlight morning rush hour (7 AM to 9 AM)
plt.axvspan(7, 9, color='yellow', alpha=0.3, label='Morning Rush Hour (7 AM - 9 AM)')
# Highlight evening rush hour (4 PM to 6 PM)
plt.axvspan(16, 18, color='orange', alpha=0.3, label='Evening Rush Hour (4 PM - 6 PM)')
# Add a legend to explain the shaded areas
plt.legend()
# Show the plot
plt.xticks(rotation=0) # Keep x-axis labels horizontal
plt.show()
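To put numbers on the shaded windows, the short follow-up below computes what share of all accidents falls inside each rush-hour band, using the hourly counts from above (the index is the two-digit hour string extracted earlier, so hours 07-09 and 16-18 match the shaded ranges):

# Share of accidents inside the morning (07-09) and evening (16-18) rush-hour windows
total = hourly_unique_counts_sorted.sum()
morning_share = hourly_unique_counts_sorted.loc['07':'09'].sum() / total * 100
evening_share = hourly_unique_counts_sorted.loc['16':'18'].sum() / total * 100
print(f"Morning rush hour (7-9 AM): {morning_share:.1f}% of accidents")
print(f"Evening rush hour (4-6 PM): {evening_share:.1f}% of accidents")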
How do accidents change over the course of a year?
# Assuming df is your Dask DataFrame with 'Start_Time' column
# List of years
years = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
months = [
'January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December'
]
# Initialize a dictionary to hold values for each month across the years
monthly_trends = {year: {month: 0 for month in months} for year in years}
# For loop to count occurrences for each month across all years
for year in years:
    for month in range(1, 13):
        year_month = f"{year}-{str(month).zfill(2)}"  # Format as 'YYYY-MM'
        count = df['Start_Time'].str.contains(year_month).sum().compute()
        # Store the count in the corresponding year and month
        monthly_trends[year][months[month - 1]] = count
# Set up the bar chart for each year
for year in years:
    counts = [monthly_trends[year][month] for month in months]
    # Calculate percentage changes for the current year
    percentage_changes = [0]  # Start with 0% for the first month
    for i in range(1, len(counts)):
        if counts[i-1] != 0:  # Avoid division by zero
            percentage_change = ((counts[i] - counts[i-1]) / counts[i-1]) * 100
        else:
            percentage_change = 0  # Set to 0 if previous month's count is 0
        percentage_changes.append(percentage_change)
    # Plotting
    plt.figure(figsize=(10, 6))
    x = np.arange(len(months))  # The x locations for the groups
    bar_width = 0.4  # Width of the bars
    plt.bar(x, counts, width=bar_width, color='skyblue', label='Accidents')
    plt.title(f"Monthly Accidents in {year}", fontsize=16)
    plt.xlabel("Month", fontsize=12)
    plt.ylabel("Number of Accidents", fontsize=12)
    plt.xticks(x, months, rotation=45)  # Rotate month labels for better readability
    plt.ylim(0, max(counts) * 1.1)  # Set y-limit for better visualization
    plt.legend(title='Metrics')
    # Adding percentage change annotations
    for month_index in range(1, len(months)):
        if counts[month_index - 1] != 0:  # Avoid division by zero
            plt.annotate(f"{percentage_changes[month_index]:.2f}%",
                         xy=(x[month_index], counts[month_index]),
                         ha='center',
                         va='bottom',
                         color='red', fontsize=10)
    plt.tight_layout()
    plt.show()
What is the distribution of accidents by severity levels (minor, moderate, severe, fatal)?
# Group by severity level
# Load the dataset
df = dd.read_parquet('/Users/er/Desktop/Data Analysis/Projects/Python/US Accidents/USTrafficAccidents/Data/Parquet/US_Accidents_March23.parquet')
severity_counts = df['Severity'].value_counts().compute()
total_severity = severity_counts.sum()
print(severity_counts)
print(total_severity)
# Get the counts of each severity level
severity_counts = df['Severity'].value_counts().compute()
# Map severity numbers to descriptive names
severity_labels = {
1: 'Minor Incident',
2: 'Moderate Incident',
3: 'Severe Incident',
4: 'Fatal Incident',
}
# Update index to descriptive names
severity_counts.index = severity_counts.index.map(severity_labels)
# Create a categorical index with the specified order
ordered_severity = ['Minor Incident', 'Moderate Incident', 'Severe Incident', 'Fatal Incident']
severity_counts = severity_counts.reindex(ordered_severity)
# Adjust the scale (convert counts to millions)
severity_counts_millions = severity_counts / 1_000_000
# Plot the distribution
plt.figure(figsize=(8, 5))
severity_counts_millions.plot(kind='bar', color='skyblue')
plt.title('Distribution of Accidents by Severity Levels')
plt.xlabel('Severity Level')
plt.ylabel('Number of Accidents (in millions)')
plt.xticks(rotation=45)
plt.grid(axis='y')
# Set y-ticks for better readability
plt.yticks([i for i in range(0, int(severity_counts_millions.max()) + 2)],
[f'{i}M' for i in range(0, int(severity_counts_millions.max()) + 2)])
# Show the plot
plt.tight_layout()
plt.show()
Severity
3    1299337
1      67366
2    6156981
4     204710
Name: count, dtype: int64
7728394
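For reference, the same counts expressed as shares of all accidents make the dominance of Severity 2 more obvious (using the relabeled severity_counts from the cell above):

# Severity counts as a percentage of all accidents
severity_pct = severity_counts / severity_counts.sum() * 100
print(severity_pct.round(2))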
What factors (e.g., weather, road conditions, time of day) seem to correlate with higher accident severity?
# Load the dataset
df = dd.read_parquet('/Users/er/Desktop/Data Analysis/Projects/Python/US Accidents/USTrafficAccidents/Data/Parquet/US_Accidents_March23.parquet')
# Select relevant columns for analysis
columns_of_interest = [
'Severity',
'Temperature(F)',
'Humidity(%)',
'Wind_Speed(mph)',
'Precipitation(in)',
'Distance(mi)',
'Traffic_Signal',
]
# Filter the DataFrame to include only relevant columns
df_filtered = df[columns_of_interest]
# Drop rows with NaN values in the selected columns
df_filtered = df_filtered.dropna()
# Filter the DataFrame to include only rows where Severity equals 4 (the most severe incidents)
df_severity_4 = df_filtered[df_filtered['Severity'] == 4]
# Compute the filtered Dask DataFrame to convert it to a Pandas DataFrame
df_severity_4 = df_severity_4.compute()
# Calculate the correlation matrix for the filtered DataFrame
correlation_matrix = df_severity_4.corr()
# Print the correlation matrix
print("Correlation Matrix for Severity Level 4 Accidents:")
print(correlation_matrix)
# Create a heatmap for the correlation matrix
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', cbar=True, square=True)
plt.title('Heatmap of Correlation Matrix for Severity Level 4 Accidents')
plt.show()
# Print the first few rows (now a Pandas DataFrame)
print(df_severity_4.head())
# Print the number of rows in the filtered DataFrame for Severity 4
print(f"Number of rows with Severity level 4: {df_severity_4.shape[0]}")
Correlation Matrix for Severity Level 4 Accidents:

| | Severity | Temperature(F) | Humidity(%) | Wind_Speed(mph) | Precipitation(in) | Distance(mi) | Traffic_Signal |
|---|---|---|---|---|---|---|---|
| Severity | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Temperature(F) | NaN | 1.000000 | -0.288157 | -0.031746 | -0.000622 | -0.025354 | 0.029288 |
| Humidity(%) | NaN | -0.288157 | 1.000000 | -0.205287 | 0.081820 | 0.012569 | -0.044182 |
| Wind_Speed(mph) | NaN | -0.031746 | -0.205287 | 1.000000 | 0.026067 | 0.036061 | 0.018568 |
| Precipitation(in) | NaN | -0.000622 | 0.081820 | 0.026067 | 1.000000 | 0.002701 | 0.001823 |
| Distance(mi) | NaN | -0.025354 | 0.012569 | 0.036061 | 0.002701 | 1.000000 | -0.086258 |
| Traffic_Signal | NaN | 0.029288 | -0.044182 | 0.018568 | 0.001823 | -0.086258 | 1.000000 |
| | Severity | Temperature(F) | Humidity(%) | Wind_Speed(mph) | Precipitation(in) | Distance(mi) | Traffic_Signal |
|---|---|---|---|---|---|---|---|
| 14035 | 4 | 63.0 | 70.0 | 13.8 | 0.00 | 0.01 | False |
| 58391 | 4 | 59.0 | 93.0 | 5.8 | 0.01 | 0.01 | False |
| 133648 | 4 | 89.1 | 63.0 | 12.7 | 0.00 | 0.00 | False |
| 135764 | 4 | 75.0 | 94.0 | 17.3 | 0.27 | 0.00 | False |
| 140384 | 4 | 80.1 | 85.0 | 10.4 | 0.00 | 0.00 | False |
Number of rows with Severity level 4: 132126
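One caveat about the matrix above: every row kept by the filter has Severity equal to 4, so Severity has zero variance and its correlations are undefined (hence the NaN row and column). A follow-up sketch computes the same correlations across all severity levels, so that Severity itself can vary:

# Correlation across all severity levels (Severity now varies, so its row is defined)
corr_all = df_filtered.astype({'Traffic_Signal': 'float64'}).corr().compute()
print(corr_all['Severity'].sort_values(ascending=False).round(3))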
Conclusion
Summary of Accident Analytics: A Deep Dive into US Traffic Data
The project analyzed over 7.7 million traffic accidents across the United States from 2016 to 2023, focusing on accident severity, geographic hotspots, and external factors such as weather and road conditions.
Key Insights:
- Accident Trends: A steady increase in traffic accidents was observed from 2016 to 2022, with a notable dip in 2020 due to COVID-19 restrictions.
- Time-Based Patterns: Accidents peaked during morning (6-9 AM) and evening (4-6 PM) rush hours, reflecting the role of heavy traffic.
- Geographic Hotspots: States like California, Florida, and Texas had the highest accident rates, while less populated states like Wyoming and South Dakota saw fewer incidents.
- Weather Impact: Severe accidents were more likely to occur during rainy or cloudy conditions, although fair weather accounted for the majority of accidents overall.
- Road Conditions: Presence of traffic signals and road features (e.g., traffic calming measures) influenced accident outcomes, especially for severe or fatal incidents.
- Seasonality: Winter months (December and January) were the most accident-prone, likely due to weather conditions and holiday travel.
These insights highlight the importance of addressing accident-prone areas, improving road safety during high-risk times, and focusing on external factors like weather and traffic conditions to reduce accident severity.