



Study with the many resources available on Docsity
Earn points by helping other students or get them with a premium plan
Prepare for your exams
Study with the many resources available on Docsity
Earn points to download
Earn points by helping other students or get them with a premium plan
This document presents a comprehensive analysis of a flight delay dataset using Python and libraries such as pandas, NumPy, Matplotlib, seaborn, and scikit-learn. The analysis includes data cleaning, exploratory data analysis, and model building using convolutional neural networks (CNNs). The dataset contains information about flight delays, airline types, and other relevant factors. The goal is to understand the factors contributing to flight delays and to build a model that predicts them.
Typology: Study Guides, Projects, Research
1 / 7
This page cannot be seen from the preview
Don't miss anything!




import pandas as pd
import numpy as np

# --- Importing dataset ---------------------------------------------------
# Colab-only alternative for mounting Google Drive (left disabled):
# from google.colab import drive
# drive.mount('/drive')
path = 'flight_data.csv'
df = pd.read_csv(path)

# First four rows; as a bare expression this is only rendered in a notebook.
df.head(4)
print("This dataset size is:\n", df.shape)

# Show every column whenever a frame is displayed.
pd.set_option('display.max_columns', None)

# Summary statistics; also a bare expression, so only rendered in a notebook.
df.describe()
# Lift the row-display cap so long listings are not truncated.
pd.set_option('display.max_rows', None)

# Per-column data types (bare expression: only rendered in a notebook).
df.dtypes

# Column Names
df.columns

# Report the overall dimensions of the frame.
row, col = df.shape
print(f'There are {row} rows and {col} columns in this dataset.')
# Number of distinct values per column.
uniqueValues = df.nunique()
print('Count of unique values in each column :\n')
print(uniqueValues)


# Checking Missing values
def missing_values_table(df):
    """Summarise the missing data in every column of *df*.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of *df* with three columns:
        'Missing Values' (number of nulls in the column),
        '% of Missing Values' (nulls as a percentage of ``len(df)``) and
        'Data Type' (the column's dtype).
    """
    # Count the nulls once and reuse the result for the percentage.
    missing_vals = df.isnull().sum(axis=0)
    missing_vals_percent = 100 * missing_vals / len(df)

    # concat labels the two unnamed Series 0 and 1; give them readable names.
    missing_vals_tbl = pd.concat([missing_vals, missing_vals_percent], axis=1)
    missing_vals_tbl = missing_vals_tbl.rename(
        columns={0: 'Missing Values', 1: '% of Missing Values'}
    )
    missing_vals_tbl['Data Type'] = df.dtypes
    return missing_vals_tbl
pd.set_option('display.max_rows', None)
# Missing-value summary (bare expression: only rendered in a notebook).
missing_values_table(df)

# Target variable = DEP_DELAY_NEW (called DepDelayMinutes in the messages below).
# Drop the records whose DEP_DELAY_NEW is 0, because 0 means no delay.
DropRecords = df[df['DEP_DELAY_NEW'] == 0].index
df.drop(DropRecords, inplace=True)
print("Dataset size after dropping records with '0' values in DepDelayMinutes : \n\n")
print(df.shape)

# Remove the columns that are not used as model inputs.
flights = df.drop(
    [
        "Unnamed: 42",
        "MONTH",
        "DAY_OF_MONTH",
        "DAY_OF_WEEK",
        "FL_DATE",
        "UNIQUE_CARRIER",
        "FL_NUM",
        "ORIGIN_AIRPORT_SEQ_ID",
        "ORIGIN_CITY_MARKET_ID",
        "ORIGIN",
        "ORIGIN_CITY_NAME",
        "ORIGIN_STATE_ABR",
        "ORIGIN_STATE_NM",
        "DEST_AIRPORT_SEQ_ID",
        "DEST_CITY_MARKET_ID",
        "DEST",
        "DEST_CITY_NAME",
        "DEST_STATE_ABR",
        "DEST_STATE_NM",
        "DEP_DELAY",
        "WHEELS_ON",
        "TAXI_IN",
        "CRS_ARR_TIME",
        "ARR_TIME",
        "ARR_DELAY",
        "ARR_DELAY_NEW",
        "CRS_ELAPSED_TIME",
        "ACTUAL_ELAPSED_TIME",
        "FLIGHTS",
        "FIRST_DEP_TIME",
    ],
    axis=1,
)

# This is the final dataset size after dropping unwanted rows and columns
print("The dataset size AFTER dropping unneccessary columns is :-\n ")
# The header above was printed without the value it announces.
print(flights.shape)
# Separate the input features from the regression target.
x = flights.drop(['DEP_DELAY_NEW'], axis=1)
y = flights.DEP_DELAY_NEW

from sklearn import model_selection
from sklearn.model_selection import train_test_split

# Split data into train and test (70 % / 30 %, fixed seed for repeatability)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
print('The size of X_train dataset:\n', x_train.shape)
print('\n')
print('The size of X_test dataset:\n', x_test.shape)

"""# CNN Base Model"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (5.0, 5.0)

# Notebook-only install commands. A leading `!` is IPython shell syntax and a
# SyntaxError in a plain .py file, so both are kept as comments; run them in a
# notebook cell or a shell if the packages are missing.
#!pip install keras
#!pip install tensorflow
from keras.models import Sequential
import keras
from keras.layers import Dense, Flatten, Conv1D
import tensorflow as tf

# Base Model: Conv1D (32 filters, kernel size 3) -> Flatten -> Dense(64) -> Dense(1).
# NOTE(review): input_shape=(12, 1) presumes x has exactly 12 feature columns
# after the drops above - confirm against flights.shape.
model6 = Sequential()
model6.add(Conv1D(32, (3), input_shape=(12, 1), activation='relu'))
model6.add(Flatten())
model6.add(Dense(64, activation='relu'))
model6.add(Dense(1, activation='linear'))
model6.compile(loss='mse', optimizer='adam', metrics=['mse'])

# Stop training once the training loss has gone 3 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# NOTE(review): the test split doubles as the validation set here.
history6 = model6.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=10,
    callbacks=[callback],
    verbose=1,
    validation_data=(x_test, y_test),
)
def _conv_regressor(n_filters, hidden_sizes=(64,)):
    """Build and compile a Conv1D -> Flatten -> Dense... -> Dense(1) regressor."""
    net = Sequential()
    net.add(Conv1D(n_filters, (3), input_shape=(12, 1), activation='relu'))
    net.add(Flatten())
    for width in hidden_sizes:
        net.add(Dense(width, activation='relu'))
    net.add(Dense(1, activation='linear'))
    net.compile(loss='mse', optimizer='adam', metrics=['mse'])
    return net


def _train(net):
    """Fit *net* on the training split with the shared early-stopping callback."""
    return net.fit(
        x_train,
        y_train,
        batch_size=16,
        epochs=100,
        callbacks=[callback],
        verbose=1,
        validation_data=(x_test, y_test),
    )


# Model Tuning 1: 64 convolution filters
model1 = _conv_regressor(64)
history1 = _train(model1)

# Model Tuning 2: 128 convolution filters
model2 = _conv_regressor(128)
history2 = _train(model2)

# Model Tuning 3: 64 convolution filters plus an extra Dense(32) layer
model3 = _conv_regressor(64, hidden_sizes=(64, 32))
history3 = _train(model3)

# Model Tuning 4: 16 convolution filters
model4 = _conv_regressor(16)
history4 = _train(model4)
# Tail of the previous graph. `epochs`, `loss2` and `val_loss2` are not
# defined in the visible part of the file - presumably set up just above.
plt.legend()
plt.figure()
plt.plot(epochs, loss2, 'b', label="Train loss")
plt.plot(epochs, val_loss2, 'r', label='Test loss')
plt.legend()
plt.show()

# GRAPH 3 - Model Tuning 3
import matplotlib.pyplot as plt

# Pull the per-epoch curves recorded while fitting model 3.
mse3, val_mse3, loss3, val_loss3 = (
    history3.history[key] for key in ('mse', 'val_mse', 'loss', 'val_loss')
)
epochs = range(1, len(mse3) + 1)

# First figure: MSE metric on the train and test data.
plt.plot(epochs, mse3, 'b', label="Train MSE")
plt.plot(epochs, val_mse3, 'r', label='Test MSE')
plt.legend()

# Second figure: loss on the train and test data.
plt.figure()
plt.plot(epochs, loss3, 'b', label="Train loss")
plt.plot(epochs, val_loss3, 'r', label='Test loss')
plt.legend()
plt.show()

# GRAPH 4 - Model Tuning 4
import matplotlib.pyplot as plt

# Pull the per-epoch curves recorded while fitting model 4.
mse4, val_mse4, loss4, val_loss4 = (
    history4.history[key] for key in ('mse', 'val_mse', 'loss', 'val_loss')
)
epochs = range(1, len(mse4) + 1)

# First figure: MSE metric on the train and test data.
plt.plot(epochs, mse4, 'b', label="Train MSE")
plt.plot(epochs, val_mse4, 'r', label='Test MSE')
plt.legend()

# Second figure: loss on the train and test data.
plt.figure()
plt.plot(epochs, loss4, 'b', label="Train loss")
plt.plot(epochs, val_loss4, 'r', label='Test loss')
plt.legend()
plt.show()