NTSB Investigations of Aircraft Incursions in the USA

https://github.com/jallard1/jallard1.github.io

https://jallard1.github.io

By Jesse Allard

CMSC 320 Fall 2020 Final Project

Professor Dickerson

Introduction

Airplane accidents can be devastating: a lot can go wrong in a very short amount of time. What's more, airplanes are common all over the country, often flying over densely-populated areas. The NTSB (National Transportation Safety Board) is a US government agency that is required to investigate air disasters. An accident is an air disaster that is usually fatal, whereas an incursion is usually a non-fatal breach of protocol. Since both events have serious consequences, the NTSB investigates both.

Data from the NTSB is available for download publicly at:

https://catalog.data.gov/dataset/aviation-data-and-documentation-from-the-ntsb-accident-database-system

This dataset contains information about all of the investigations that were conducted by the NTSB up to December of 2015.

The website above links to a large XML file (42.8 MB) that contains every NTSB investigation: the reason for the investigation, the time and location of the incident, the type of incident, whether it was fatal, and so on.

The goal of this tutorial is to use this data with the data science pipeline. This will allow us to visualize this data and possibly draw some interesting conclusions from it.

Prerequisites

We will be using Jupyter Notebooks and Python version 3. In addition, we will be using several Python libraries, including: Pandas, NumPy, xml.etree.ElementTree, Folium, Matplotlib, Seaborn, Datetime, and Sklearn.

Make sure you have these prerequisites installed on your computer. Once that is done, open a Jupyter Notebook and download the XML dataset (linked above) to the same directory as your Jupyter Notebook.

Step 1: Gathering the Data

We will be using the Pandas and xml modules to gather the data. The xml module is used to read data from the XML file that we downloaded. Then, we use Pandas to create a DataFrame object, which allows us to view and access the data later without having to re-read the XML.

First, visit the page at: https://catalog.data.gov/dataset/aviation-data-and-documentation-from-the-ntsb-accident-database-system

Download the data offered at that webpage. It is available as an XML file. Save that to a folder on your computer where your notebook is stored. Then, run the following code to parse the XML into a tree and load the data into a Pandas dataframe.

Great! Now that we have the data in a Pandas DataFrame, let us visualize the data so we can understand what it means. First, we can plot this as a map so we can see where in the USA these crashes originated. Notice how some rows (in fact, a lot of rows) are missing a latitude and longitude above. One way to solve this would be the Geopy module, which can convert a city name into a latitude and longitude. Below, we can see that several incursions occurred at Teterboro, NJ, but only one of those records holds a latitude and longitude. For our analysis, this is satisfactory, as we will only need to plot any given city once on the map. Therefore, we can safely fill in the empty latitude and longitude values with np.NaN, which means "not a number".

Next, we want to fill in any missing numerical values with np.NaN. This will allow us to treat an entire column as numeric rather than as objects of unknown type. The following code fills in the empty locations with np.NaN.

NOTE: For our analysis, latitude and longitude may be np.NaN if the incident occurred outside of the United States. This is alright since we are only going to be looking at the crashes that occurred over the United States (since we are using an NTSB dataset). We do not care about missing location data for incidents that occurred outside the United States since we are not analyzing those incidents.

We can fill in the missing data for Injuries as np.NaN, since missing data there means that no injuries were reported for the incursion.

It is possible to fill in missing latitude and longitude data using only a city or province name. If you wish to know more about this process, I will direct you to the Geopy package for Python. This package allows us to look up the recorded latitude and longitude (and other information too) from a city or province name. While this is not necessary for our analysis, you may find it worthwhile to use in the future. Here is information about the Geopy package: https://pypi.org/project/geopy/

Congratulations, you have completed the data collection process! The next step will be to visualize the data:

Step 2: Visualizing the Data

Now that we have the data in our aviation dataframe, we can start to visualize the data. First, let us add a column for the Month and the Year of each incursion. In doing so, we will be parsing the date of the incursion as a Datetime object.

We will start by plotting a bar graph of the number of incursions per year.

We can observe that the number of incursions in recent years is lower than in earlier years. We may be tempted to conclude that our modern airways are safer. However, keep in mind that the NTSB is a government agency, so it can take a long time before a finalized ruling about an investigation is actually published. Hence, there may be (and probably are) a lot of recent incursions that have not yet been published due to backlogs and/or pending investigations.

Now, we will look at the number of incursions that happen per month. To do this, we will look at all years together and see how many incursions have ever occurred in the month of January, February, etc. individually.

These graphs show the number of incidents that occurred in each month (aggregating all years together). The first graph shows the number of incidents per month, while the second graph shows the number of incidents normalized to the number of days in each month (ignoring the extra day in a leap year).

From these graphs, we can clearly see that the month of July contains more accidents than any other month. In fact, they appear to show that the frequency of accidents is highest in the summertime (with the highest rate of incidents in July) and lower in the winter months (with the lowest rate of incidents in January or December).

We could speculate that these results might be from an increased number of flights in the summer months. It could also be from an increase in the amount of inclement weather in the summer months; an increase in pilot distraction in summer months; an increase in mechanical failures in warmer weather; or a decrease in safety precautions in warmer months.

In case you were wondering, here is what our aviation dataframe looks like right now. We just finished creating bar graphs using the newly-added "Month" and "Year" columns on the dataframe:

Now, we will look at the most common brand names in this dataset of incursions. First, we must convert the brand names to lower case so that differently-capitalized entries are counted together. Then, we can count the number of occurrences of each value (i.e. of each Make) to see which are the most common in the dataset.

Here, we can see the top 8 Makes of aircraft in the NTSB dataset. Let us visualize the number of fatalities by aircraft as a violin plot for these 8 most common makes. We will be using Seaborn, which is an excellent module that allows us to create visualizations. If you want to know more about Seaborn, I will direct you to: https://seaborn.pydata.org/

In the violin plot above, we are looking at the 8 most common aircraft that are involved in incursions. To simplify the visualization, we are only looking at the incursions that had between 1 and 14 fatalities (inclusive). We can observe that the majority of these accidents involved around 1 fatality. Especially with the Bellanca and Grumman aircraft, the number of fatalities is consistently low (and never above 5 and 8, respectively). On the other hand, the Boeing, Beech, and Bell fatalities extend above 14, as we can see a tail at the top of those plots.

We can attribute this to the size and/or type of the aircraft. Boeing aircraft compose the majority of commercial flights in the US, so they have a lot of souls onboard. Hence, it makes sense that most of those accidents involve a large number of fatalities. We can infer that Bellanca and Grumman aircraft probably do not host more than 8 souls on a flight, since those plots never seem to show more than 8 fatalities.

Now, we will be using Folium to create a map of all incursions in the dataset. Keep in mind that we are only plotting the latitude and longitude of a city once (as we discovered earlier in the tutorial).

More information about Folium can be found at: https://python-visualization.github.io/folium/

We will add each incursion location to our map as a red circle of radius 1. The map is centered on the United States, and we should only pay attention to the points over (and directly surrounding) the USA: the location data for other countries is not consistent in the dataset, since the NTSB only investigates flights to/from the USA.

The map above shows only the fatal incidents and the location where they occurred. Note that our dataset comes from the NTSB, so it only contains data about flights departing from and/or arriving in the USA. Therefore, we cannot make conclusions about other countries based on the data.

We can see that more populated regions in the USA seem to have a higher concentration of fatal incidents than regions of lower population.

By the way, an interesting point in the NTSB data is the amount of damage sustained by the aircraft. Below, we can see the number of incursions that resulted in Substantial damage, the number that resulted in a destroyed aircraft, and the number that sustained only minor damage. Not all of the incidents in the data are fatal, yet a lot of incidents resulted in substantial damage to the aircraft. Yikes! No wonder it costs so much to fly!

Congratulations, you have successfully visualized the data! By now, you should have a pretty good understanding of what the NTSB data represents.

Continue on to Step 3 when you are ready.

Step 3: Making predictions from the data

Let's see if we can determine anything from the number of days that it took the NTSB to investigate. We know the date of the event and the publication date for all incidents in the table. We can find the difference between these dates to yield the amount of time that the investigation took (i.e. the number of days before the information was published).

We only care about the accidents that were fatal, so we can filter the data to keep only the events that had at least 1 fatality. We then plot these fatality counts against the Days Investigating:

We can see that some investigations took a really long time to complete (12,000 days, or roughly 32 years, for some of the incidents). However, don't be fooled by the seemingly sparse look of the scatter plot: there are a lot of accidents in our model.

Using the scatterplot above, let's form a machine learning model that can predict the number of fatalities based on the number of days that it took to investigate the crash. We will be creating a linear regression model from the data in the above scatter plot. This webpage will give you a basic understanding of what a linear regression is supposed to do: https://www.w3schools.com/python/python_ml_linear_regression.asp

If you want to learn more about the sklearn module, I invite you to follow this link: https://scikit-learn.org/stable/user_guide.html

We now have a linear regression model of our data. This model looks at the number of total fatal injuries and the number of days that the NTSB took to investigate. This linear model can (hopefully) be used to make predictions about future NTSB data. If we know the number of days that the NTSB has taken to investigate an accident, we can apply our linear regression to predict the number of fatalities that occurred in that incursion.

In the table above, we applied our testing data to our linear regression model. The values in the table indicate the actual number of fatalities (based on the number of days it took to investigate). Next to that column, we can see the value that our model predicted as the number of expected fatalities based on the duration of the investigation.

Unfortunately, our model is not very accurate at predicting the number of fatalities based on the duration of the investigation. This can be caused by several factors. Looking at the scatterplot above, I would have to estimate that the number of days investigating might simply not be correlated with the number of fatalities. It could also be the case that our model might benefit from gradient descent or a polynomial regression analysis.

If you would like to know more about how a machine learning and linear regression can be used in Python, I invite you to read the following: https://www.geeksforgeeks.org/introduction-machine-learning-using-python/

Congratulations, you have finished creating a machine learning model of the data! Let us discuss what we learned:

Conclusions

This tutorial has shown you how to apply the data science pipeline to a dataset. We saw how to collect data from an XML file and how to load it into a Pandas dataframe. Then, we filled in missing values in the data. Once that was complete, we visualized the data to get a better understanding of what it means. We came to conclusions from these visual models. Finally, we looked at a machine learning approach that can be used to predict future data.

Aircraft are fascinating, and understanding the number and significance of incursions is incredibly important. NTSB data can be intimidating since it is such a large dataset. Due to its size, I think the NTSB dataset is an excellent example to use in this tutorial. I hope you enjoyed analyzing the NTSB data with me!

https://jallard1.github.io

https://github.com/jallard1/jallard1.github.io