This is one of many analysis pieces used to create a story on Boston Public School buses tardiness.

Calculate various datasets.

data <- rawdata %>%
  filter(
    errortype == '',
    schtype == 'BPS',
    !is.na(bell_late),
    !is.na(sched_late),
    !is.na(ActualArrivalTime),
    !is.na(PlannedAnchorTime),
    !is.na(belltime)
  ) %>%
  mutate(
    PlannedAnchorTime = hm(PlannedAnchorTime)/minutes(1),
    ActualArrivalTime = hm(ActualArrivalTime)/minutes(1),
    belltime = hm(belltime)/minutes(1),
    plannedBeforeBell = (PlannedAnchorTime <= belltime)
  ) %>%
  mutate(
    date = dmy(date),
    five_late = ((ActualArrivalTime - belltime) + 5),
    is.five.late = plannedBeforeBell & (five_late > 0)
  ) %>%
  filter(plannedBeforeBell)
  
# date, five_late, count
# each row is a count of bus arrivals for that day and minute
lateTrips <- data %>%
  filter(is.five.late) %>%
  group_by(date, five_late) %>%
  tally() %>%
  rename(count = n) %>%
  ungroup() %>%
  arrange(date)

# date, late.trips
# each row is a count of late trips for that day
lateTripsPerDay <- lateTrips %>%
  group_by(date) %>%
  summarise(late.trips = sum(count)) %>%
  arrange(date)

# date, early.trips, late.trips
# each row is a count of total/early/late trips for that day
tripsPerDay <- data %>%
  group_by(date) %>%
  tally() %>%
  rename(total.trips = n) %>%
  inner_join(lateTripsPerDay, by=c('date')) %>%
  mutate(early.trips = total.trips - late.trips) %>%
  arrange(date)

write.csv(lateTrips, 'output/lateTrips.csv', row.names=F)
write.csv(tripsPerDay, 'output/tripsPerDay.csv', row.names=F)

This is a story about buses. Boston Public School buses. On any given day, hundreds of buses carry thousands of kids to school. Here’s September 4, the first day of classes.

tripsPerDay %>%
  mutate(is.first = row_number() == 1) %>%
  head(3) %>%
  ggplot(aes(date, total.trips, alpha=is.first)) +
  geom_bar(stat='identity') +
  theme(legend.position='none') +
  scale_alpha_manual(values=c(0, 1)) +
  ylab('bus trips')


And here’s the rest of the year.

tripsPerDay %>%
  ggplot(aes(date, total.trips)) +
  geom_bar(stat='identity') +
  theme(legend.position='none') +
  ylab('bus trips')


Over 14% of buses arrived late.

data <- melt(tripsPerDay, id=c('date')) %>%
  filter(variable %in% c('early.trips', 'late.trips'))

ggplot(data, aes(date, value, fill=variable, color=variable)) +
  geom_bar(stat='identity') +
  scale_colour_manual(values=c('#ea212d', 'grey80')) +
  theme(legend.position='none') +
  ylab('bus trips')

ggplot(tripsPerDay, aes(date, late.trips)) +
  geom_bar(stat='identity', fill='#ea212d')


On the first day of school, 648 buses show up after the bell.

tripsPerDay %>%
  mutate(is.first = row_number() == 1) %>%
  head(3) %>%
  ggplot(aes(date, late.trips, alpha=is.first)) +
  geom_bar(stat='identity', fill='#ea212d') +
  theme(legend.position='none') +
  scale_alpha_manual(values=c(0, 1))


Most tardy buses arrive about 15 minutes late on the first day of school, but some show up over 90 minutes after the bell.

lateTrips %>%
  filter(date == ymd('2013-09-04')) %>%
  ggplot(aes(five_late, count, five_late)) +
  coord_flip() +
  geom_bar(stat='identity', fill='#ea212d')


Over the school year, as drivers become familiar with their routes, tardiness decreases. But buses continue to arrive late.

lateTrips %>%
  ggplot(aes(date, five_late)) +
  geom_point(colour='#ea212d', size = 1)


What about looking at the first week?

data <- lateTrips %>%
  filter(floor_date(date, 'week') == ymd('2013-09-01'))

write.csv(data, 'output/a1_teaser.csv', row.names=F)

ggplot(data, aes(date, five_late)) +
  geom_point(aes(size=count), colour='#ea212d') +
  geom_point(shape=1, aes(size=count), colour='#59080d') +
  scale_size_area(max_size=9)

lateTrips %>%
  filter(floor_date(date, 'week') == ymd('2013-09-01')) %>%
  ggplot(aes(five_late, count, five_late)) +
  coord_flip() +
  geom_bar(stat='identity', fill='#ea212d') +
  facet_grid(. ~ date)


Are there weekly patterns? In other words, find the tardiness percentage for every day. Then do a histogram, facet by day of week.

tripsPerDay %>%
  mutate(
    day = wday(date),
    pct = 100*late.trips/total.trips
  ) %>%
  select(day, pct) %>%
  ggplot() +
  geom_boxplot(aes(day, pct)) +
  facet_grid(. ~ day)


That doesn’t look useful. Let’s look at tardiness median per day.

tripsPerDay %>%
  mutate(
    day = wday(date),
    pct = 100*late.trips/total.trips
  ) %>%
  select(day, pct) %>%
  group_by(day) %>%
  summarise(median = median(pct)) %>%
  ggplot(aes(day, median)) +
  geom_bar(stat='identity')