In [1]:
import pandas as pd
In [3]:
data = pd.read_csv(r"D:\ml-python\regression\Airline_Delay_Cause.csv")
In [4]:
data.head()
Out[4]:
| year | month | carrier | carrier_name | airport | airport_name | arr_flights | arr_del15 | carrier_ct | weather_ct | ... | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | 5 | 9E | Endeavor Air Inc. | ABE | Allentown/Bethlehem/Easton, PA: Lehigh Valley ... | 136.0 | 7.0 | 5.95 | 0.00 | ... | 0.0 | 1.00 | 0.0 | 0.0 | 255.0 | 222.0 | 0.0 | 4.0 | 0.0 | 29.0 |
| 1 | 2022 | 5 | 9E | Endeavor Air Inc. | ABY | Albany, GA: Southwest Georgia Regional | 91.0 | 16.0 | 7.38 | 0.00 | ... | 0.0 | 6.09 | 0.0 | 0.0 | 884.0 | 351.0 | 0.0 | 81.0 | 0.0 | 452.0 |
| 2 | 2022 | 5 | 9E | Endeavor Air Inc. | ACK | Nantucket, MA: Nantucket Memorial | 19.0 | 2.0 | 0.13 | 0.00 | ... | 0.0 | 0.88 | 1.0 | 0.0 | 138.0 | 4.0 | 0.0 | 106.0 | 0.0 | 28.0 |
| 3 | 2022 | 5 | 9E | Endeavor Air Inc. | AEX | Alexandria, LA: Alexandria International | 88.0 | 14.0 | 7.26 | 0.76 | ... | 0.0 | 1.64 | 0.0 | 0.0 | 947.0 | 585.0 | 35.0 | 125.0 | 0.0 | 202.0 |
| 4 | 2022 | 5 | 9E | Endeavor Air Inc. | AGS | Augusta, GA: Augusta Regional at Bush Field | 181.0 | 19.0 | 13.84 | 0.00 | ... | 0.0 | 2.09 | 0.0 | 0.0 | 808.0 | 662.0 | 0.0 | 87.0 | 0.0 | 59.0 |
5 rows × 21 columns
In [5]:
data.dropna(inplace=True)
In [11]:
data.columns
Out[11]:
Index(['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name',
'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
'security_delay', 'late_aircraft_delay'],
dtype='object')
In [13]:
y=data['carrier']
X=data[['arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
'security_delay', 'late_aircraft_delay']]
In [15]:
X.info()
<class 'pandas.core.frame.DataFrame'> Index: 317261 entries, 0 to 318016 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 arr_flights 317261 non-null float64 1 arr_del15 317261 non-null float64 2 carrier_ct 317261 non-null float64 3 weather_ct 317261 non-null float64 4 nas_ct 317261 non-null float64 5 security_ct 317261 non-null float64 6 late_aircraft_ct 317261 non-null float64 7 arr_cancelled 317261 non-null float64 8 arr_diverted 317261 non-null float64 9 arr_delay 317261 non-null float64 10 carrier_delay 317261 non-null float64 11 weather_delay 317261 non-null float64 12 nas_delay 317261 non-null float64 13 security_delay 317261 non-null float64 14 late_aircraft_delay 317261 non-null float64 dtypes: float64(15) memory usage: 38.7 MB
In [18]:
y.head()
Out[18]:
0 9E 1 9E 2 9E 3 9E 4 9E Name: carrier, dtype: object
binary class¶
In [23]:
Data = data[data['carrier'].isin(['OO','DL'])]
In [24]:
Data.info()
Out[24]:
| year | month | carrier | carrier_name | airport | airport_name | arr_flights | arr_del15 | carrier_ct | weather_ct | ... | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 352 | 2022 | 5 | DL | Delta Air Lines Inc. | ABQ | Albuquerque, NM: Albuquerque International Sun... | 93.0 | 12.0 | 6.64 | 1.00 | ... | 0.00 | 4.25 | 2.0 | 0.0 | 429.0 | 218.0 | 49.0 | 4.0 | 0.0 | 158.0 |
| 353 | 2022 | 5 | DL | Delta Air Lines Inc. | ALB | Albany, NY: Albany International | 93.0 | 28.0 | 16.07 | 0.00 | ... | 0.00 | 5.87 | 5.0 | 0.0 | 1374.0 | 602.0 | 0.0 | 199.0 | 0.0 | 573.0 |
| 354 | 2022 | 5 | DL | Delta Air Lines Inc. | ANC | Anchorage, AK: Ted Stevens Anchorage Internati... | 215.0 | 25.0 | 17.90 | 0.00 | ... | 0.00 | 4.16 | 2.0 | 1.0 | 958.0 | 676.0 | 0.0 | 63.0 | 0.0 | 219.0 |
| 355 | 2022 | 5 | DL | Delta Air Lines Inc. | ATL | Atlanta, GA: Hartsfield-Jackson Atlanta Intern... | 18297.0 | 2573.0 | 1012.39 | 95.51 | ... | 0.52 | 963.54 | 444.0 | 23.0 | 193784.0 | 102534.0 | 7741.0 | 19640.0 | 34.0 | 63835.0 |
| 356 | 2022 | 5 | DL | Delta Air Lines Inc. | ATW | Appleton, WI: Appleton International | 31.0 | 3.0 | 2.00 | 0.00 | ... | 0.00 | 0.00 | 0.0 | 0.0 | 109.0 | 89.0 | 0.0 | 20.0 | 0.0 | 0.0 |
5 rows × 21 columns
In [25]:
y=Data['carrier']
X=Data[['arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
'security_delay', 'late_aircraft_delay']]
In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=44, shuffle =True)
#Splitted Data
print('X_train shape is ' , X_train.shape)
print('X_test shape is ' , X_test.shape)
print('y_train shape is ' , y_train.shape)
print('y_test shape is ' , y_test.shape)
X_train shape is (49167, 15) X_test shape is (16389, 15) y_train shape is (49167,) y_test shape is (16389,)
In [27]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifierModel = RandomForestClassifier(criterion = 'gini',n_estimators=100,max_depth=2,random_state=33) #criterion can be also : entropy
RandomForestClassifierModel.fit(X_train, y_train)
Out[27]:
RandomForestClassifier(max_depth=2, random_state=33)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=2, random_state=33)
In [28]:
y_pred = RandomForestClassifierModel.predict(X_test)
y_pred_prob = RandomForestClassifierModel.predict_proba(X_test)
In [29]:
y_pred
Out[29]:
array(['DL', 'OO', 'OO', ..., 'OO', 'OO', 'OO'], dtype=object)
In [31]:
from sklearn.metrics import classification_report
ClassificationReport = classification_report(y_test,y_pred)
print('Classification Report is : ', ClassificationReport )
Classification Report is : precision recall f1-score support
DL 0.72 0.45 0.55 6838
OO 0.69 0.87 0.77 9551
accuracy 0.70 16389
macro avg 0.70 0.66 0.66 16389
weighted avg 0.70 0.70 0.68 16389
In [ ]:
In [ ]:
In [ ]:
In [30]:
y_pred_prob
Out[30]:
array([[0.56346272, 0.43653728],
[0.25932051, 0.74067949],
[0.33315407, 0.66684593],
...,
[0.44088871, 0.55911129],
[0.49383778, 0.50616222],
[0.34346312, 0.65653688]])
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [1]:
import pandas as pd
In [3]:
data = pd.read_csv(r"D:\ml-python\regression\Airline_Delay_Cause.csv")
In [4]:
data.head()
Out[4]:
| year | month | carrier | carrier_name | airport | airport_name | arr_flights | arr_del15 | carrier_ct | weather_ct | ... | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | 5 | 9E | Endeavor Air Inc. | ABE | Allentown/Bethlehem/Easton, PA: Lehigh Valley ... | 136.0 | 7.0 | 5.95 | 0.00 | ... | 0.0 | 1.00 | 0.0 | 0.0 | 255.0 | 222.0 | 0.0 | 4.0 | 0.0 | 29.0 |
| 1 | 2022 | 5 | 9E | Endeavor Air Inc. | ABY | Albany, GA: Southwest Georgia Regional | 91.0 | 16.0 | 7.38 | 0.00 | ... | 0.0 | 6.09 | 0.0 | 0.0 | 884.0 | 351.0 | 0.0 | 81.0 | 0.0 | 452.0 |
| 2 | 2022 | 5 | 9E | Endeavor Air Inc. | ACK | Nantucket, MA: Nantucket Memorial | 19.0 | 2.0 | 0.13 | 0.00 | ... | 0.0 | 0.88 | 1.0 | 0.0 | 138.0 | 4.0 | 0.0 | 106.0 | 0.0 | 28.0 |
| 3 | 2022 | 5 | 9E | Endeavor Air Inc. | AEX | Alexandria, LA: Alexandria International | 88.0 | 14.0 | 7.26 | 0.76 | ... | 0.0 | 1.64 | 0.0 | 0.0 | 947.0 | 585.0 | 35.0 | 125.0 | 0.0 | 202.0 |
| 4 | 2022 | 5 | 9E | Endeavor Air Inc. | AGS | Augusta, GA: Augusta Regional at Bush Field | 181.0 | 19.0 | 13.84 | 0.00 | ... | 0.0 | 2.09 | 0.0 | 0.0 | 808.0 | 662.0 | 0.0 | 87.0 | 0.0 | 59.0 |
5 rows × 21 columns
In [5]:
data.dropna(inplace=True)
In [11]:
data.columns
Out[11]:
Index(['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name',
'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
'security_delay', 'late_aircraft_delay'],
dtype='object')
In [13]:
y=data['carrier']
X=data[['arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
'security_delay', 'late_aircraft_delay']]
In [15]:
X.info()
<class 'pandas.core.frame.DataFrame'> Index: 317261 entries, 0 to 318016 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 arr_flights 317261 non-null float64 1 arr_del15 317261 non-null float64 2 carrier_ct 317261 non-null float64 3 weather_ct 317261 non-null float64 4 nas_ct 317261 non-null float64 5 security_ct 317261 non-null float64 6 late_aircraft_ct 317261 non-null float64 7 arr_cancelled 317261 non-null float64 8 arr_diverted 317261 non-null float64 9 arr_delay 317261 non-null float64 10 carrier_delay 317261 non-null float64 11 weather_delay 317261 non-null float64 12 nas_delay 317261 non-null float64 13 security_delay 317261 non-null float64 14 late_aircraft_delay 317261 non-null float64 dtypes: float64(15) memory usage: 38.7 MB
In [18]:
y.head()
Out[18]:
0 9E 1 9E 2 9E 3 9E 4 9E Name: carrier, dtype: object
multi class¶
In [23]:
Data = data[data['carrier'].isin(['OO','DL'])]
In [24]:
Data.info()
Out[24]:
| year | month | carrier | carrier_name | airport | airport_name | arr_flights | arr_del15 | carrier_ct | weather_ct | ... | security_ct | late_aircraft_ct | arr_cancelled | arr_diverted | arr_delay | carrier_delay | weather_delay | nas_delay | security_delay | late_aircraft_delay | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 352 | 2022 | 5 | DL | Delta Air Lines Inc. | ABQ | Albuquerque, NM: Albuquerque International Sun... | 93.0 | 12.0 | 6.64 | 1.00 | ... | 0.00 | 4.25 | 2.0 | 0.0 | 429.0 | 218.0 | 49.0 | 4.0 | 0.0 | 158.0 |
| 353 | 2022 | 5 | DL | Delta Air Lines Inc. | ALB | Albany, NY: Albany International | 93.0 | 28.0 | 16.07 | 0.00 | ... | 0.00 | 5.87 | 5.0 | 0.0 | 1374.0 | 602.0 | 0.0 | 199.0 | 0.0 | 573.0 |
| 354 | 2022 | 5 | DL | Delta Air Lines Inc. | ANC | Anchorage, AK: Ted Stevens Anchorage Internati... | 215.0 | 25.0 | 17.90 | 0.00 | ... | 0.00 | 4.16 | 2.0 | 1.0 | 958.0 | 676.0 | 0.0 | 63.0 | 0.0 | 219.0 |
| 355 | 2022 | 5 | DL | Delta Air Lines Inc. | ATL | Atlanta, GA: Hartsfield-Jackson Atlanta Intern... | 18297.0 | 2573.0 | 1012.39 | 95.51 | ... | 0.52 | 963.54 | 444.0 | 23.0 | 193784.0 | 102534.0 | 7741.0 | 19640.0 | 34.0 | 63835.0 |
| 356 | 2022 | 5 | DL | Delta Air Lines Inc. | ATW | Appleton, WI: Appleton International | 31.0 | 3.0 | 2.00 | 0.00 | ... | 0.00 | 0.00 | 0.0 | 0.0 | 109.0 | 89.0 | 0.0 | 20.0 | 0.0 | 0.0 |
5 rows × 21 columns
In [32]:
y=data['carrier']
X=data[['arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
'security_delay', 'late_aircraft_delay']]
In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=44, shuffle =True)
#Splitted Data
print('X_train shape is ' , X_train.shape)
print('X_test shape is ' , X_test.shape)
print('y_train shape is ' , y_train.shape)
print('y_test shape is ' , y_test.shape)
X_train shape is (237945, 15) X_test shape is (79316, 15) y_train shape is (237945,) y_test shape is (79316,)
In [34]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifierModel = RandomForestClassifier(criterion = 'gini',n_estimators=100,max_depth=2,random_state=33) #criterion can be also : entropy
RandomForestClassifierModel.fit(X_train, y_train)
Out[34]:
RandomForestClassifier(max_depth=2, random_state=33)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=2, random_state=33)
In [35]:
y_pred = RandomForestClassifierModel.predict(X_test)
y_pred_prob = RandomForestClassifierModel.predict_proba(X_test)
In [36]:
y_pred
Out[36]:
array(['OO', 'WN', 'WN', ..., 'OO', 'OO', 'WN'], dtype=object)
In [37]:
from sklearn.metrics import classification_report
ClassificationReport = classification_report(y_test,y_pred)
print('Classification Report is : ', ClassificationReport )
D:\anaconda1\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
D:\anaconda1\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Classification Report is : precision recall f1-score support
9E 0.00 0.00 0.00 3048
AA 0.00 0.00 0.00 4925
AQ 0.00 0.00 0.00 69
AS 0.00 0.00 0.00 3189
B6 0.00 0.00 0.00 2954
CO 0.00 0.00 0.00 1712
DH 0.00 0.00 0.00 488
DL 0.31 0.12 0.17 6884
EV 0.00 0.00 0.00 6601
F9 0.00 0.00 0.00 2993
FL 0.00 0.00 0.00 1715
G4 0.00 0.00 0.00 1600
HA 0.00 0.00 0.00 878
HP 0.00 0.00 0.00 426
MQ 0.00 0.00 0.00 6119
NK 0.00 0.00 0.00 946
NW 0.00 0.00 0.00 2010
OH 0.00 0.00 0.00 3062
OO 0.13 0.87 0.22 9455
QX 0.00 0.00 0.00 205
RU 0.00 0.00 0.00 1073
TZ 0.00 0.00 0.00 232
UA 0.00 0.00 0.00 4746
US 0.00 0.00 0.00 2707
VX 0.00 0.00 0.00 371
WN 0.24 0.68 0.36 4331
XE 0.00 0.00 0.00 1837
YV 0.00 0.00 0.00 3665
YX 0.00 0.00 0.00 1075
accuracy 0.15 79316
macro avg 0.02 0.06 0.03 79316
weighted avg 0.06 0.15 0.06 79316
D:\anaconda1\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
In [ ]:
In [ ]:
In [ ]:
In [38]:
y_pred_prob
Out[38]:
array([[0.04235015, 0.05656573, 0.00062612, ..., 0.026474 , 0.04870011,
0.0136786 ],
[0.01745506, 0.09516632, 0.00091928, ..., 0.02850223, 0.02705532,
0.01253049],
[0.01531888, 0.11541461, 0.00038751, ..., 0.01516527, 0.01984721,
0.01256389],
...,
[0.03853651, 0.03384994, 0.00187478, ..., 0.01727916, 0.06874375,
0.01684659],
[0.04569009, 0.02961095, 0.0016231 , ..., 0.01543546, 0.07113034,
0.02054276],
[0.01376051, 0.0962056 , 0.00057358, ..., 0.01504365, 0.02120843,
0.01050682]])
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
0 Comments