Data-Driven Example on using dplyr: Analysis on Airplane Delay

1 Introduction

This is a quick example on how to use dplyer. Suppose you are given a flight duration data,in the following format

library(readr)
library(tidyverse)
library(ggplot2)
library(dplyr)
flight_data <- read_csv("D:/newsite/My_Website/static/data/flight.data.csv")
head(flight_data)
## # A tibble: 6 x 8
##      X1 origin dest  carrier sched_dep_time dep_time sched_arr_time arr_time
##   <dbl> <chr>  <chr> <chr>            <dbl>    <dbl>          <dbl>    <dbl>
## 1     1 LGA    IAH   UA                1859     1900           2159     2204
## 2     2 JFK    BOS   DL                1455     1452           1613     1553
## 3     3 EWR    DEN   WN                1220     1228           1450     1426
## 4     4 EWR    TPA   UA                1422     1417           1715     1714
## 5     5 LGA    DTW   MQ                1315     1305           1520     1459
## 6     6 JFK    FLL   DL                2025     2020           2342     2312

Now, you are wondering what is the the on–time departure percentage by carrier, and the flight durations. Notice, you must make a correction when the destination airport is in a time zone that is not thesame as the origin airport.

Therefore, for getting the correct flight duration, we need some extra information, Here, you would be find enough information for the most airport. Let’s download it and see.

airports <-read_csv("D:/newsite/My_Website/static/data/airports.csv")
head(airports)
## # A tibble: 6 x 14
##     `1` `Goroka Airport` Goroka `Papua New Guin~ GKA   AYGA  `-6.08168983459~
##   <dbl> <chr>            <chr>  <chr>            <chr> <chr>            <dbl>
## 1     2 Madang Airport   Madang Papua New Guinea MAG   AYMD             -5.21
## 2     3 Mount Hagen Kag~ Mount~ Papua New Guinea HGU   AYMH             -5.83
## 3     4 Nadzab Airport   Nadzab Papua New Guinea LAE   AYNZ             -6.57
## 4     5 Port Moresby Ja~ Port ~ Papua New Guinea POM   AYPY             -9.44
## 5     6 Wewak Internati~ Wewak  Papua New Guinea WWK   AYWK             -3.58
## 6     7 Narsarsuaq Airp~ Narss~ Greenland        UAK   BGBW             61.2 
## # ... with 7 more variables: `145.391998291` <dbl>, `5282` <dbl>, `10` <dbl>,
## #   U <chr>, `Pacific/Port_Moresby` <chr>, airport <chr>, OurAirports <chr>

2 Data Cleaning

Obviously, the data needs to be refined and combined. Here is the process:

colnames(airports) <- c(
  "Airport_ID",
  "Name",
  "City",
  "Country",
  "IATA",
  "ICAO",
  "Latitude",
  "Longitude",
  "Altitude",
  "Timezone",
  "DST",
  "Tz_database_time_zone",
  "Type",
  "Source"
)
airports$Airport_ID = airports$Airport_ID - 1

flight_new = mutate(
  flight_data,
  dep_hour = flight_data$dep_time %/% 100,
  dep_mins =  flight_data$dep_time %% 100,
  arrive_hour = flight_data$arr_time %/% 100,
  arrive_mins = flight_data$arr_time %% 100
)

final1 = left_join(flight_new, airports, by = c("origin" = "IATA"))
final2 = left_join(final1, airports, by = c("dest" = "IATA"))

## The function will help to calculate the duration time

flight_time_calculator <-
  function(dep.hour,
           dep.mins,
           dep.UTC,
           arr.hour,
           arr.mins,
           arr.UTC) {
    input = c(dep.hour, dep.mins, dep.UTC, arr.hour, arr.mins, arr.UTC)
    if (any(is.na(input) == TRUE)) {
      duration = c(NA, NA)
    }
    else{
      dep_hour_UTC = dep.hour - dep.UTC
      arr_hour_UTC = arr.hour - arr.UTC
      if (arr_hour_UTC < dep_hour_UTC) {
        hour_final = arr_hour_UTC + 24 - dep_hour_UTC
      }
      else{
        hour_final = arr_hour_UTC - dep_hour_UTC
      }
      
      mins_final = arr.mins - dep.mins
      if (mins_final < 0) {
        hour_final = (hour_final * 60 + mins_final) %/% 60
        mins_final = (hour_final * 60 + mins_final) %% 60
      }
      duration = c(hour_final, mins_final)
      
    }
    return(duration)
  }

final2$duration = rep(0, length(final2$X1))
for (i in 1:length(final2$X1)) {
  duration = flight_time_calculator(
    final2$dep_hour[i],
    final2$dep_mins[i],
    final2$Timezone.x[i],
    final2$arrive_hour[i],
    final2$arrive_mins[i],
    final2$Timezone.y[i]
  )
  final2$duration[i] = duration[1] * 60 + duration[2]
}

## see the duration in the last column
head(final2)
## # A tibble: 6 x 39
##      X1 origin dest  carrier sched_dep_time dep_time sched_arr_time arr_time
##   <dbl> <chr>  <chr> <chr>            <dbl>    <dbl>          <dbl>    <dbl>
## 1     1 LGA    IAH   UA                1859     1900           2159     2204
## 2     2 JFK    BOS   DL                1455     1452           1613     1553
## 3     3 EWR    DEN   WN                1220     1228           1450     1426
## 4     4 EWR    TPA   UA                1422     1417           1715     1714
## 5     5 LGA    DTW   MQ                1315     1305           1520     1459
## 6     6 JFK    FLL   DL                2025     2020           2342     2312
## # ... with 31 more variables: dep_hour <dbl>, dep_mins <dbl>,
## #   arrive_hour <dbl>, arrive_mins <dbl>, Airport_ID.x <dbl>, Name.x <chr>,
## #   City.x <chr>, Country.x <chr>, ICAO.x <chr>, Latitude.x <dbl>,
## #   Longitude.x <dbl>, Altitude.x <dbl>, Timezone.x <dbl>, DST.x <chr>,
## #   Tz_database_time_zone.x <chr>, Type.x <chr>, Source.x <chr>,
## #   Airport_ID.y <dbl>, Name.y <chr>, City.y <chr>, Country.y <chr>,
## #   ICAO.y <chr>, Latitude.y <dbl>, Longitude.y <dbl>, Altitude.y <dbl>,
## #   Timezone.y <dbl>, DST.y <chr>, Tz_database_time_zone.y <chr>, Type.y <chr>,
## #   Source.y <chr>, duration <dbl>

3 Results

Hence, we can know the durations of the five longest flights are:

final2_sort = final2[with(final2, order(-final2$duration)),]
print(final2_sort[1:5, c(2:3,39)])
## # A tibble: 5 x 3
##   origin dest  duration
##   <chr>  <chr>    <dbl>
## 1 JFK    HNL        695
## 2 JFK    HNL        693
## 3 JFK    HNL        673
## 4 JFK    HNL        604
## 5 EWR    HNL        577

As you can see the flights are all to Hawii. Now let’s find out which flight company has the highest on-time flight rate

z1 = group_by(final2_sort,carrier)

(per_carrier =summarise(z1,on_time_rate=mean(1 * (dep_time - sched_dep_time < 15),na.rm = TRUE)))
## # A tibble: 15 x 2
##    carrier on_time_rate
##    <chr>          <dbl>
##  1 9E             0.754
##  2 AA             0.868
##  3 AS             0.667
##  4 B6             0.771
##  5 DL             0.780
##  6 EV             0.599
##  7 F9             0.5  
##  8 FL             0.75 
##  9 HA             1    
## 10 MQ             0.740
## 11 UA             0.665
## 12 US             0.875
## 13 VX             0.556
## 14 WN             0.778
## 15 YV             1
p = ggplot(per_carrier,
           aes(
             x = reorder(per_carrier$carrier,-per_carrier$on_time_rate),
             y = per_carrier$on_time_rate,
           )) + geom_bar(stat="identity")
p +ggtitle(" Flight On-time Status") + labs(y = "On Time Rate",x= "Carrier")

comments powered by Disqus
Next
Previous