Data Preprocessing and Analysis of Flight Delay Dataset, Study Guides, Projects, Research of Machine Learning

A comprehensive analysis of a flight delay dataset using Python and libraries such as pandas, NumPy, Matplotlib, seaborn, and scikit-learn. The analysis covers data cleaning, exploratory data analysis, and model building with convolutional neural networks (CNNs). The dataset contains information about flight delays, airline types, and other relevant factors. The goal is to understand which factors contribute to flight delays and to build a model that predicts them.

Typology: Study Guides, Projects, Research

2021/2022

Uploaded on 03/10/2024

yashwinie-seliyan
yashwinie-seliyan 🇲🇾

1 document

1 / 7

Toggle sidebar

This page cannot be seen from the preview

Don't miss anything!

bg1
import pandas as pd
import numpy as np
# Load the flight-delay dataset from a CSV file in the working directory.
# Google Colab only: uncomment the next two lines to mount Drive first.
#from google.colab import drive
#drive.mount('/drive')
path = ('flight_data.csv')
df = pd.read_csv(path)
# Bare expression: its value is only rendered when it is the last statement
# of a notebook cell; in a plain script it has no visible effect.
df.head(4)
print ("This dataset size is:\n",df.shape)
# Show every column when a frame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
df.describe()
# Data type of each column; max_rows=None so the listing is not truncated.
pd.set_option('display.max_rows', None)
df.dtypes
# Column names
df.columns
row, col = df.shape
print(f'There are {row} rows and {col} columns in this dataset.')
# Number of distinct values in each column
uniqueValues = df.nunique()
print('Count of unique values in each column :\n')
print(uniqueValues)
#Checking Missing values
def missing_values_table(df):
    """Summarise missing values per column of a DataFrame.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of ``df`` with three columns:
        'Missing Values' (count of nulls), '% of Missing Values'
        (nulls as a percentage of ``len(df)``) and 'Data Type' (column dtype).
    """
    # Compute the per-column null count once and derive the percentage from it.
    missing_vals = df.isnull().sum()
    missing_vals_percent = 100 * missing_vals / len(df)
    # Concatenating two unnamed Series yields columns 0 and 1; rename them.
    missing_vals_tbl = pd.concat([missing_vals, missing_vals_percent], axis=1)
    missing_vals_tbl = missing_vals_tbl.rename(
        columns={0: 'Missing Values', 1: '% of Missing Values'}
    )
    missing_vals_tbl['Data Type'] = df.dtypes
    return missing_vals_tbl
pf3
pf4
pf5

Partial preview of the text

Download Data Preprocessing and Analysis of Flight Delay Dataset and more Study Guides, Projects, Research Machine Learning in PDF only on Docsity!

import pandas as pd
import numpy as np

# Load the flight-delay dataset from a CSV file in the working directory.
# Google Colab only: uncomment the next two lines to mount Drive first.
# from google.colab import drive
# drive.mount('/drive')
path = ('flight_data.csv')
df = pd.read_csv(path)
# Bare expression: only rendered when it is the last statement of a
# notebook cell; in a plain script it has no visible effect.
df.head(4)
print("This dataset size is:\n", df.shape)
# Show every column when a frame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
df.describe()

# Datatypes for each attribute
pd.set_option('display.max_rows', None)
df.dtypes

# Column Names
df.columns
row, col = df.shape
print(f'There are {row} rows and {col} columns in this dataset.')

# Count of Unique values
uniqueValues = df.nunique()
print('Count of unique values in each column :\n')
print(uniqueValues)


# Checking Missing values
def missing_values_table(df):
    """Summarise missing values per column of a DataFrame.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of ``df`` with three columns:
        'Missing Values' (count of nulls), '% of Missing Values'
        (nulls as a percentage of ``len(df)``) and 'Data Type' (column dtype).
    """
    # Compute the per-column null count once and derive the percentage from it.
    missing_vals = df.isnull().sum()
    missing_vals_percent = 100 * missing_vals / len(df)
    # Concatenating two unnamed Series yields columns 0 and 1; rename them.
    missing_vals_tbl = pd.concat([missing_vals, missing_vals_percent], axis=1)
    missing_vals_tbl = missing_vals_tbl.rename(
        columns={0: 'Missing Values', 1: '% of Missing Values'}
    )
    missing_vals_tbl['Data Type'] = df.dtypes
    return missing_vals_tbl

pd.set_option('display.max_rows', None)
missing_values_table(df)

# Target variable: DEP_DELAY_NEW (referred to as "DepDelayMinutes" in the
# printed messages below).
# Drop records whose DEP_DELAY_NEW is 0, because 0 means no delay.
DropRecords = df[df['DEP_DELAY_NEW'] == 0].index
df.drop(DropRecords, inplace=True)
print("Dataset size after dropping records with '0' values in DepDelayMinutes : \n\n")
print(df.shape)

# Remove the columns that are not used as model inputs.
flights = df.drop(
    [
        "Unnamed: 42", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK", "FL_DATE",
        "UNIQUE_CARRIER", "FL_NUM", "ORIGIN_AIRPORT_SEQ_ID",
        "ORIGIN_CITY_MARKET_ID", "ORIGIN", "ORIGIN_CITY_NAME",
        "ORIGIN_STATE_ABR", "ORIGIN_STATE_NM", "DEST_AIRPORT_SEQ_ID",
        "DEST_CITY_MARKET_ID", "DEST", "DEST_CITY_NAME", "DEST_STATE_ABR",
        "DEST_STATE_NM", "DEP_DELAY", "WHEELS_ON", "TAXI_IN", "CRS_ARR_TIME",
        "ARR_TIME", "ARR_DELAY", "ARR_DELAY_NEW", "CRS_ELAPSED_TIME",
        "ACTUAL_ELAPSED_TIME", "FLIGHTS", "FIRST_DEP_TIME",
    ],
    axis=1,
)

# This is the final dataset size after dropping unwanted rows and columns.
# NOTE(review): only the label is printed here; the statement that prints
# flights.shape is not visible in this excerpt - confirm it follows.
print("The dataset size AFTER dropping unneccessary columns is :-\n ")

# Features are every remaining column except the target; the target is
# DEP_DELAY_NEW.
x = flights.drop(['DEP_DELAY_NEW'], axis=1)
y = flights.DEP_DELAY_NEW

from sklearn import model_selection
from sklearn.model_selection import train_test_split

# Split data into train (70%) and test (30%) with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42
)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
print('The size of X_train dataset:\n', x_train.shape)
print('\n')
print('The size of X_test dataset:\n', x_test.shape)

# CNN Base Model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = (5.0, 5.0)

# Notebook-only shell commands: "!" is IPython syntax and a SyntaxError in a
# plain .py file. Run them in a notebook cell or install the packages first.
# !pip install keras
# !pip install tensorflow
from keras.models import Sequential
import keras
from keras.layers import Dense, Flatten, Conv1D
import tensorflow as tf

# Base Model: one Conv1D layer (32 filters, kernel size 3) followed by a
# 64-unit dense layer and a single linear output for regression.
# NOTE(review): input_shape=(12, 1) assumes x has 12 feature columns - confirm.
model6 = Sequential()
model6.add(Conv1D(32, 3, input_shape=(12, 1), activation='relu'))
model6.add(Flatten())
model6.add(Dense(64, activation='relu'))
model6.add(Dense(1, activation='linear'))
model6.compile(loss='mse', optimizer='adam', metrics=['mse'])

# Stop training once the training loss has not improved for 3 epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
history6 = model6.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=10,
    callbacks=[callback],
    verbose=1,
    validation_data=(x_test, y_test),
)

# Model Tuning 1

# Tuning 1: 64 convolution filters, trained for up to 100 epochs.
# `callback`, x_train/x_test and y_train/y_test are defined earlier in the
# script.
model1 = Sequential()
model1.add(Conv1D(64, 3, input_shape=(12, 1), activation='relu'))
model1.add(Flatten())
model1.add(Dense(64, activation='relu'))
model1.add(Dense(1, activation='linear'))
model1.compile(loss='mse', optimizer='adam', metrics=['mse'])
history1 = model1.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=100,
    callbacks=[callback],
    verbose=1,
    validation_data=(x_test, y_test),
)

# Model Tuning 2: 128 convolution filters.
model2 = Sequential()
model2.add(Conv1D(128, 3, input_shape=(12, 1), activation='relu'))
model2.add(Flatten())
model2.add(Dense(64, activation='relu'))
model2.add(Dense(1, activation='linear'))
model2.compile(loss='mse', optimizer='adam', metrics=['mse'])
history2 = model2.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=100,
    callbacks=[callback],
    verbose=1,
    validation_data=(x_test, y_test),
)

# Model Tuning 3: 64 convolution filters plus an extra 32-unit dense layer.
model3 = Sequential()
model3.add(Conv1D(64, 3, input_shape=(12, 1), activation='relu'))
model3.add(Flatten())
model3.add(Dense(64, activation='relu'))
model3.add(Dense(32, activation='relu'))
model3.add(Dense(1, activation='linear'))
model3.compile(loss='mse', optimizer='adam', metrics=['mse'])
history3 = model3.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=100,
    callbacks=[callback],
    verbose=1,
    validation_data=(x_test, y_test),
)

# Model Tuning 4: 16 convolution filters.
model4 = Sequential()
model4.add(Conv1D(16, 3, input_shape=(12, 1), activation='relu'))
model4.add(Flatten())
model4.add(Dense(64, activation='relu'))
model4.add(Dense(1, activation='linear'))
model4.compile(loss='mse', optimizer='adam', metrics=['mse'])
history4 = model4.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=100,
    callbacks=[callback],
    verbose=1,
    validation_data=(x_test, y_test),
)

# NOTE(review): this excerpt opens part-way through the previous graph; the
# statements that define `epochs`, `loss2` and `val_loss2` are not visible
# here and must precede this point in the full script.
plt.legend()
plt.figure()
plt.plot(epochs, loss2, 'b', label="Train loss")
plt.plot(epochs, val_loss2, 'r', label='Test loss')
plt.legend()
plt.show()

# GRAPH 3 - Model Tuning 3
import matplotlib.pyplot as plt

mse3 = history3.history['mse']
val_mse3 = history3.history['val_mse']
loss3 = history3.history['loss']
val_loss3 = history3.history['val_loss']
# One x-axis tick per recorded epoch, starting at 1.
epochs = range(1, len(mse3) + 1)
plt.plot(epochs, mse3, 'b', label="Train MSE")
plt.plot(epochs, val_mse3, 'r', label='Test MSE')
plt.legend()
# Start a second figure so the loss curves are drawn separately from the MSE.
plt.figure()
plt.plot(epochs, loss3, 'b', label="Train loss")
plt.plot(epochs, val_loss3, 'r', label='Test loss')
plt.legend()
plt.show()

# GRAPH 4 - Model Tuning 4
import matplotlib.pyplot as plt

mse4 = history4.history['mse']
val_mse4 = history4.history['val_mse']
loss4 = history4.history['loss']
val_loss4 = history4.history['val_loss']
# One x-axis tick per recorded epoch, starting at 1.
epochs = range(1, len(mse4) + 1)
plt.plot(epochs, mse4, 'b', label="Train MSE")
plt.plot(epochs, val_mse4, 'r', label='Test MSE')
plt.legend()
# Start a second figure so the loss curves are drawn separately from the MSE.
plt.figure()
plt.plot(epochs, loss4, 'b', label="Train loss")
plt.plot(epochs, val_loss4, 'r', label='Test loss')
plt.legend()
plt.show()