classification

In [1]:

import pandas as pd

In [3]:

data = pd.read_csv(r"D:\ml-python\regression\Airline_Delay_Cause.csv")

In [4]:

data.head()

Out[4]:

	year	month	carrier	carrier_name	airport	airport_name	arr_flights	arr_del15	carrier_ct	weather_ct	...	late_aircraft_ct	arr_cancelled	arr_delay	carrier_delay	weather_delay	nas_delay	late_aircraft_delay
0	2022	5	9E	Endeavor Air Inc.	ABE	Allentown/Bethlehem/Easton, PA: Lehigh Valley ...	136.0	7.0	5.95	0.00	...	1.00	0.0	255.0	222.0	0.0	4.0	29.0
1	2022	5	9E	Endeavor Air Inc.	ABY	Albany, GA: Southwest Georgia Regional	91.0	16.0	7.38	0.00	...	6.09	0.0	884.0	351.0	0.0	81.0	452.0
2	2022	5	9E	Endeavor Air Inc.	ACK	Nantucket, MA: Nantucket Memorial	19.0	2.0	0.13	0.00	...	0.88	1.0	138.0	4.0	0.0	106.0	28.0
3	2022	5	9E	Endeavor Air Inc.	AEX	Alexandria, LA: Alexandria International	88.0	14.0	7.26	0.76	...	1.64	0.0	947.0	585.0	35.0	125.0	202.0
4	2022	5	9E	Endeavor Air Inc.	AGS	Augusta, GA: Augusta Regional at Bush Field	181.0	19.0	13.84	0.00	...	2.09	0.0	808.0	662.0	0.0	87.0	59.0

5 rows × 21 columns

In [5]:

data.dropna(inplace=True)

In [11]:

data.columns

Out[11]:

Index(['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name',
       'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
       'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
       'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay'],
      dtype='object')

In [13]:

y=data['carrier']
X=data[['arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
       'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
       'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay']]

In [15]:

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 317261 entries, 0 to 318016
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   arr_flights          317261 non-null  float64
 1   arr_del15            317261 non-null  float64
 2   carrier_ct           317261 non-null  float64
 3   weather_ct           317261 non-null  float64
 4   nas_ct               317261 non-null  float64
 5   security_ct          317261 non-null  float64
 6   late_aircraft_ct     317261 non-null  float64
 7   arr_cancelled        317261 non-null  float64
 8   arr_diverted         317261 non-null  float64
 9   arr_delay            317261 non-null  float64
 10  carrier_delay        317261 non-null  float64
 11  weather_delay        317261 non-null  float64
 12  nas_delay            317261 non-null  float64
 13  security_delay       317261 non-null  float64
 14  late_aircraft_delay  317261 non-null  float64
dtypes: float64(15)
memory usage: 38.7 MB

In [18]:

y.head()

Out[18]:

0    9E
1    9E
2    9E
3    9E
4    9E
Name: carrier, dtype: object

binary class¶

In [23]:

Data = data[data['carrier'].isin(['OO','DL'])]

In [24]:

Data.info()

Out[24]:

	year	month	carrier	carrier_name	airport	airport_name	arr_flights	arr_del15	carrier_ct	weather_ct	...	security_ct	late_aircraft_ct	arr_cancelled	arr_diverted	arr_delay	carrier_delay	weather_delay	nas_delay	security_delay	late_aircraft_delay
352	2022	5	DL	Delta Air Lines Inc.	ABQ	Albuquerque, NM: Albuquerque International Sun...	93.0	12.0	6.64	1.00	...	0.00	4.25	2.0	0.0	429.0	218.0	49.0	4.0	0.0	158.0
353	2022	5	DL	Delta Air Lines Inc.	ALB	Albany, NY: Albany International	93.0	28.0	16.07	0.00	...	0.00	5.87	5.0	0.0	1374.0	602.0	0.0	199.0	0.0	573.0
354	2022	5	DL	Delta Air Lines Inc.	ANC	Anchorage, AK: Ted Stevens Anchorage Internati...	215.0	25.0	17.90	0.00	...	0.00	4.16	2.0	1.0	958.0	676.0	0.0	63.0	0.0	219.0
355	2022	5	DL	Delta Air Lines Inc.	ATL	Atlanta, GA: Hartsfield-Jackson Atlanta Intern...	18297.0	2573.0	1012.39	95.51	...	0.52	963.54	444.0	23.0	193784.0	102534.0	7741.0	19640.0	34.0	63835.0
356	2022	5	DL	Delta Air Lines Inc.	ATW	Appleton, WI: Appleton International	31.0	3.0	2.00	0.00	...	0.00	0.00	0.0	0.0	109.0	89.0	0.0	20.0	0.0	0.0

5 rows × 21 columns

In [25]:

y=Data['carrier']
X=Data[['arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
       'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
       'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay']]

In [26]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=44, shuffle =True)

#Splitted Data
print('X_train shape is ' , X_train.shape)
print('X_test shape is ' , X_test.shape)
print('y_train shape is ' , y_train.shape)
print('y_test shape is ' , y_test.shape)

X_train shape is  (49167, 15)
X_test shape is  (16389, 15)
y_train shape is  (49167,)
y_test shape is  (16389,)

In [27]:

from sklearn.ensemble import RandomForestClassifier
RandomForestClassifierModel = RandomForestClassifier(criterion = 'gini',n_estimators=100,max_depth=2,random_state=33) #criterion can be also : entropy 
RandomForestClassifierModel.fit(X_train, y_train)

Out[27]:

RandomForestClassifier(max_depth=2, random_state=33)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [28]:

y_pred = RandomForestClassifierModel.predict(X_test)
y_pred_prob = RandomForestClassifierModel.predict_proba(X_test)

In [29]:

y_pred

Out[29]:

array(['DL', 'OO', 'OO', ..., 'OO', 'OO', 'OO'], dtype=object)

In [31]:

from sklearn.metrics import classification_report
ClassificationReport = classification_report(y_test,y_pred)
print('Classification Report is : ', ClassificationReport )

Classification Report is :                precision    recall  f1-score   support

          DL       0.72      0.45      0.55      6838
          OO       0.69      0.87      0.77      9551

    accuracy                           0.70     16389
   macro avg       0.70      0.66      0.66     16389
weighted avg       0.70      0.70      0.68     16389

In [ ]:

In [30]:

y_pred_prob

Out[30]:

array([[0.56346272, 0.43653728],
       [0.25932051, 0.74067949],
       [0.33315407, 0.66684593],
       ...,
       [0.44088871, 0.55911129],
       [0.49383778, 0.50616222],
       [0.34346312, 0.65653688]])

In [ ]:

In [1]:

import pandas as pd

In [3]:

data = pd.read_csv(r"D:\ml-python\regression\Airline_Delay_Cause.csv")

In [4]:

data.head()

Out[4]:

	year	month	carrier	carrier_name	airport	airport_name	arr_flights	arr_del15	carrier_ct	weather_ct	...	late_aircraft_ct	arr_cancelled	arr_delay	carrier_delay	weather_delay	nas_delay	late_aircraft_delay
0	2022	5	9E	Endeavor Air Inc.	ABE	Allentown/Bethlehem/Easton, PA: Lehigh Valley ...	136.0	7.0	5.95	0.00	...	1.00	0.0	255.0	222.0	0.0	4.0	29.0
1	2022	5	9E	Endeavor Air Inc.	ABY	Albany, GA: Southwest Georgia Regional	91.0	16.0	7.38	0.00	...	6.09	0.0	884.0	351.0	0.0	81.0	452.0
2	2022	5	9E	Endeavor Air Inc.	ACK	Nantucket, MA: Nantucket Memorial	19.0	2.0	0.13	0.00	...	0.88	1.0	138.0	4.0	0.0	106.0	28.0
3	2022	5	9E	Endeavor Air Inc.	AEX	Alexandria, LA: Alexandria International	88.0	14.0	7.26	0.76	...	1.64	0.0	947.0	585.0	35.0	125.0	202.0
4	2022	5	9E	Endeavor Air Inc.	AGS	Augusta, GA: Augusta Regional at Bush Field	181.0	19.0	13.84	0.00	...	2.09	0.0	808.0	662.0	0.0	87.0	59.0

5 rows × 21 columns

In [5]:

data.dropna(inplace=True)

In [11]:

data.columns

Out[11]:

Index(['year', 'month', 'carrier', 'carrier_name', 'airport', 'airport_name',
       'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
       'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
       'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay'],
      dtype='object')

In [13]:

y=data['carrier']
X=data[['arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
       'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
       'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay']]

In [15]:

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 317261 entries, 0 to 318016
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   arr_flights          317261 non-null  float64
 1   arr_del15            317261 non-null  float64
 2   carrier_ct           317261 non-null  float64
 3   weather_ct           317261 non-null  float64
 4   nas_ct               317261 non-null  float64
 5   security_ct          317261 non-null  float64
 6   late_aircraft_ct     317261 non-null  float64
 7   arr_cancelled        317261 non-null  float64
 8   arr_diverted         317261 non-null  float64
 9   arr_delay            317261 non-null  float64
 10  carrier_delay        317261 non-null  float64
 11  weather_delay        317261 non-null  float64
 12  nas_delay            317261 non-null  float64
 13  security_delay       317261 non-null  float64
 14  late_aircraft_delay  317261 non-null  float64
dtypes: float64(15)
memory usage: 38.7 MB

In [18]:

y.head()

Out[18]:

0    9E
1    9E
2    9E
3    9E
4    9E
Name: carrier, dtype: object

multi class¶

In [23]:

Data = data[data['carrier'].isin(['OO','DL'])]

In [24]:

Data.info()

Out[24]:

	year	month	carrier	carrier_name	airport	airport_name	arr_flights	arr_del15	carrier_ct	weather_ct	...	security_ct	late_aircraft_ct	arr_cancelled	arr_diverted	arr_delay	carrier_delay	weather_delay	nas_delay	security_delay	late_aircraft_delay
352	2022	5	DL	Delta Air Lines Inc.	ABQ	Albuquerque, NM: Albuquerque International Sun...	93.0	12.0	6.64	1.00	...	0.00	4.25	2.0	0.0	429.0	218.0	49.0	4.0	0.0	158.0
353	2022	5	DL	Delta Air Lines Inc.	ALB	Albany, NY: Albany International	93.0	28.0	16.07	0.00	...	0.00	5.87	5.0	0.0	1374.0	602.0	0.0	199.0	0.0	573.0
354	2022	5	DL	Delta Air Lines Inc.	ANC	Anchorage, AK: Ted Stevens Anchorage Internati...	215.0	25.0	17.90	0.00	...	0.00	4.16	2.0	1.0	958.0	676.0	0.0	63.0	0.0	219.0
355	2022	5	DL	Delta Air Lines Inc.	ATL	Atlanta, GA: Hartsfield-Jackson Atlanta Intern...	18297.0	2573.0	1012.39	95.51	...	0.52	963.54	444.0	23.0	193784.0	102534.0	7741.0	19640.0	34.0	63835.0
356	2022	5	DL	Delta Air Lines Inc.	ATW	Appleton, WI: Appleton International	31.0	3.0	2.00	0.00	...	0.00	0.00	0.0	0.0	109.0	89.0	0.0	20.0	0.0	0.0

5 rows × 21 columns

In [32]:

y=data['carrier']
X=data[['arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct', 'nas_ct',
       'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted',
       'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay']]

In [33]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=44, shuffle =True)

#Splitted Data
print('X_train shape is ' , X_train.shape)
print('X_test shape is ' , X_test.shape)
print('y_train shape is ' , y_train.shape)
print('y_test shape is ' , y_test.shape)

X_train shape is  (237945, 15)
X_test shape is  (79316, 15)
y_train shape is  (237945,)
y_test shape is  (79316,)

In [34]:

from sklearn.ensemble import RandomForestClassifier
RandomForestClassifierModel = RandomForestClassifier(criterion = 'gini',n_estimators=100,max_depth=2,random_state=33) #criterion can be also : entropy 
RandomForestClassifierModel.fit(X_train, y_train)

Out[34]:

RandomForestClassifier(max_depth=2, random_state=33)

In [35]:

y_pred = RandomForestClassifierModel.predict(X_test)
y_pred_prob = RandomForestClassifierModel.predict_proba(X_test)

In [36]:

y_pred

Out[36]:

array(['OO', 'WN', 'WN', ..., 'OO', 'OO', 'WN'], dtype=object)

In [37]:

from sklearn.metrics import classification_report
ClassificationReport = classification_report(y_test,y_pred)
print('Classification Report is : ', ClassificationReport )

D:\anaconda1\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
D:\anaconda1\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Classification Report is :                precision    recall  f1-score   support

          9E       0.00      0.00      0.00      3048
          AA       0.00      0.00      0.00      4925
          AQ       0.00      0.00      0.00        69
          AS       0.00      0.00      0.00      3189
          B6       0.00      0.00      0.00      2954
          CO       0.00      0.00      0.00      1712
          DH       0.00      0.00      0.00       488
          DL       0.31      0.12      0.17      6884
          EV       0.00      0.00      0.00      6601
          F9       0.00      0.00      0.00      2993
          FL       0.00      0.00      0.00      1715
          G4       0.00      0.00      0.00      1600
          HA       0.00      0.00      0.00       878
          HP       0.00      0.00      0.00       426
          MQ       0.00      0.00      0.00      6119
          NK       0.00      0.00      0.00       946
          NW       0.00      0.00      0.00      2010
          OH       0.00      0.00      0.00      3062
          OO       0.13      0.87      0.22      9455
          QX       0.00      0.00      0.00       205
          RU       0.00      0.00      0.00      1073
          TZ       0.00      0.00      0.00       232
          UA       0.00      0.00      0.00      4746
          US       0.00      0.00      0.00      2707
          VX       0.00      0.00      0.00       371
          WN       0.24      0.68      0.36      4331
          XE       0.00      0.00      0.00      1837
          YV       0.00      0.00      0.00      3665
          YX       0.00      0.00      0.00      1075

    accuracy                           0.15     79316
   macro avg       0.02      0.06      0.03     79316
weighted avg       0.06      0.15      0.06     79316

D:\anaconda1\Lib\site-packages\sklearn\metrics\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

In [ ]:

In [38]:

y_pred_prob

Out[38]:

array([[0.04235015, 0.05656573, 0.00062612, ..., 0.026474  , 0.04870011,
        0.0136786 ],
       [0.01745506, 0.09516632, 0.00091928, ..., 0.02850223, 0.02705532,
        0.01253049],
       [0.01531888, 0.11541461, 0.00038751, ..., 0.01516527, 0.01984721,
        0.01256389],
       ...,
       [0.03853651, 0.03384994, 0.00187478, ..., 0.01727916, 0.06874375,
        0.01684659],
       [0.04569009, 0.02961095, 0.0016231 , ..., 0.01543546, 0.07113034,
        0.02054276],
       [0.01376051, 0.0962056 , 0.00057358, ..., 0.01504365, 0.02120843,
        0.01050682]])

In [ ]:

Menu

classification

binary class¶

multi class¶

0 Comments

Tags

Contact form

Menu

classification

binary class¶

multi class¶

You may like these posts

0 Comments

Tags

Contact form