Overview

The goal in this analysis to:

  • examine basic jackpot and sales data for the Mega Million lottery since October 2017 (when current rules went into effect)
  • calculate the likelihood of there being a split jackpot, depending on how many people bought tickets
  • visualize how the annuity value and cash value of the jackpot have changed over time
  • calculate how much winnings are reduced by federal and state (CA) taxes
  • calculate the expected value of every Mega Millions tickets since October 2017, after adjusting for:
    • cash value of ticket
    • taxes
    • likelihood of splitting the pot

— Settings —

knitr::opts_chunk$set(echo = TRUE)
plot_save <- FALSE

— Packages —

library(readxl)
library(tidyverse)
library(lubridate)
library(binom)

— Load Data —

1. Read

lott <- read_xlsx("lottery_sales_data.xlsx")

2. View

View(lott)

3. Types

# examine cols&types
t(data.frame(lapply(lott, class)))
##          [,1]      [,2]     
## date     "POSIXct" "POSIXt" 
## sales_mm "numeric" "numeric"
## sales_jj "numeric" "numeric"
## jackpot  "numeric" "numeric"

— Analysis —

1. Basic Shaping & Viz

The goal here is to just get the data into a nice, workable format. And to do some quick and dirty, cursory visualizations, to get a sense of what the main variables of interest (jackpot sizes, and total sales) look like.

1.1. Shape

lott_clean <-
lott %>% 
  mutate(jackpot = jackpot * 1000000,
         sales = sales_mm + sales_jj)

lott_clean

1.2. Graph: Just Jackpot

Graph of jackpot sizes over time. Axes and everything still ugly.

ggplot(data = lott_clean,
       aes(x = date,
           y = jackpot)) +
  geom_point() +
  geom_line()

## 1.2. Graph: Just Jackpot (NICE)

color_jackpot <- "green4"

plot_just_sales <-
lott_clean %>% 
  mutate(jackpot = jackpot / 1e6,
         sales = sales / 1e6) %>% 
ggplot(aes(x = as.Date(date))) +
  geom_line(aes(y = jackpot),
            color = color_jackpot,
            size = 2,
            alpha = 0.5) +
  scale_x_date(name = "Date",
               date_labels = "%Y",
               date_breaks = "1 year") +
  scale_y_continuous(labels = c("$0", paste0("$", seq(1, 9, 1)*100, "m"), paste0("$", seq(1, 1.6, 0.1), "b")),
                     breaks = c(0, 1:16*100),
                     name = "Jackpot Value \n (m=million, b=billion)") +
  labs(title = "Jackpot Size Over Time") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 15),
        axis.text.y.left = element_text(color = color_jackpot,
                                        size = 15),
        axis.title.y.left = element_text(color = color_jackpot,
                                         size = 15,
                                         margin = margin(t = 0, r = 10, b = 0, l = 0)),
        axis.text.x = element_text(angle = 0)
        )
plot_just_sales

## 1.2. Save

if (plot_save){
ggsave(plot = plot_just_sales,
       filename = "jackpot_sales_only.pdf",
       width = 15,
       height = 7)
}

1.3. Jackpot Percentiles and Mean

Jackpot sizes at various percentiles (in millions of dollars)

quantile(round(lott_clean$jackpot/1e6, digits = 1), probs = c(0, 0.01, 0.05, seq(0.1, 0.9, 0.1), 0.95, 0.99, 1))
##     0%     1%     5%    10%    20%    30%    40%    50%    60%    70%    80% 
##   20.0   20.7   40.0   45.0   55.0   75.0   97.0  125.5  160.0  205.0  265.0 
##    90%    95%    99%   100% 
##  354.0  427.0  652.3 1537.0
mean(lott_clean$jackpot)/1e6
## [1] 170.625

1.4. Graph: Sales

Graph of total sales figures over time. Axes and everything still ugly.

ggplot(data = lott_clean,
       aes(x = date,
           y = sales)) +
  geom_point() +
  geom_line()

1.5. Sales Percentiles (in millions)

Total ticket sales at various percentiles.

quantile(round(lott_clean$sales/1e6, digits = 1), probs = c(0, 0.01, 0.05, seq(0.1, 0.9, 0.1), 0.95, 0.99, 1))
##      0%      1%      5%     10%     20%     30%     40%     50%     60%     70% 
##  16.000  16.435  17.800  18.400  19.400  20.650  22.300  24.200  26.000  29.100 
##     80%     90%     95%     99%    100% 
##  34.900  48.100  69.700 196.855 740.100

Average

mean(lott_clean$sales)/1e6
## [1] 34.73495

1.6. Correlation

Just calculate the correlation between jackpot size and total sales. Very high.

cor(lott_clean$sales, lott_clean$jackpot)
## [1] 0.8135666

1.7. Basic Linear Model

And the same relationship, just as a regression.

m1 <- lm(data = lott_clean,
         formula = sales ~ jackpot)

summary(m1)
## 
## Call:
## lm(formula = sales ~ jackpot, data = lott_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -57774557 -15728933   2483981  16127021 317148914 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.374e+07  2.558e+06  -5.374 1.45e-07 ***
## jackpot      2.841e-01  1.111e-02  25.570  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31470000 on 334 degrees of freedom
## Multiple R-squared:  0.6619, Adjusted R-squared:  0.6609 
## F-statistic: 653.8 on 1 and 334 DF,  p-value: < 2.2e-16

2. Viz: Sales & Jackpot Figures

The goal here is to create a pretty plot that shows both jackpot sizes and total sales together (using two y-axes), in a way that captures how much they move together.

Resources:

2.1. Graph: w/ Outlier

coeff <- max(lott_clean$jackpot/1e6)/max(lott_clean$sales/1e6)
color_jackpot <- "green4"
color_sales <- "grey32"

plot_2.1 <-
lott_clean %>% 
  mutate(jackpot = jackpot / 1e6,
         sales = sales / 1e6) %>% 
ggplot(aes(x = as.Date(date))) +
  geom_line(aes(y = jackpot),
            color = color_jackpot,
            size = 2,
            alpha = 0.5) +
  geom_line(aes(y = sales * coeff),
            color = color_sales,
            size = 1.25,
            alpha = 0.5) +
  scale_x_date(name = "Date",
               date_labels = "%Y",
               date_breaks = "1 year") +
  scale_y_continuous(labels = c("$0", paste0("$", seq(1, 9, 1)*100, "m"), paste0("$", seq(1, 1.6, 0.1), "b")),
                     breaks = c(0, 1:16*100),
                     name = "Jackpot Value \n (m=million, b=billion)",
                     sec.axis = sec_axis(trans = ~.*(1/coeff),
                                       name = "Tickets Sold (millions)",
                                       breaks = c(0, seq(0, 8, 0.5)*100))) +
  labs(title = "Jackpot Size & Ticket Sales, Over Time") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 17),
        axis.text.y.left = element_text(color = color_jackpot,
                                        size = 15),
        axis.ticks.y.left = element_line(color = color_jackpot),
        axis.title.y.left = element_text(color = color_jackpot,
                                         size = 15,
                                         margin = margin(t = 0, r = 10, b = 0, l = 0)),
        axis.title.y.right = element_text(margin = margin(t = 0, r = 0, b = 0, l = 10),
                                          size = 15,
                                          color = color_sales),
        axis.text.y.right = element_text(size = 15,
                                         color = color_sales),
        axis.ticks.y.right = element_line(color = color_sales)
        )

plot_2.1

2.1. Save

if (plot_save){
ggsave(plot = plot_2.1,
       filename = "jackpot_sales_full.pdf",
       width = 15,
       height = 7)
}

3. Split Pot

The goal in this section is to calculate the odds of splitting the lottery jackpot, as a function of how any other people have bought tickets. First, I will make a function to do these calculations. Then I will implement the function on actual sales data.

3.1. Function: odds of split pot

inputs:

  • number of potential winners
  • number of tickets sold
  • probability of winning jackpot

output:

  • odd of this many people splitting the pot

See Bill Butler’s website for formal calculations. (It’s just the binomial distribution.)

prob_k <- function(num_winners, num_tickets, prob_win){
  #return(choose(n = num_tickets, k = num_winners) * prob_win^num_winners * (1-prob_win)^(num_tickets-num_winners))
  return(dbinom(size = num_tickets, x = num_winners, prob = prob_win)) #(same as above)
}

3.1. Test

Test cases for function (long endless scroll of output not displayed in Rmd).

print("---1---")
(1/302575350)
print("---2---")
prob_k(num_winners = 1, num_tickets = 1, prob_win = (1/302575350))
prob_k(num_winners = 1, num_tickets = 2, prob_win = (1/302575350))
print("---3---")
prob_k(num_winners = 1, num_tickets = 2, prob_win = (1/302575350))/prob_k(num_winners = 1, num_tickets = 1, prob_win = (1/302575350))
prob_k(num_winners = 1, num_tickets = 3, prob_win = (1/302575350))/prob_k(num_winners = 1, num_tickets = 1, prob_win = (1/302575350))
prob_k(num_winners = 1, num_tickets = 4, prob_win = (1/302575350))/prob_k(num_winners = 1, num_tickets = 1, prob_win = (1/302575350))
print("---4---")
prob_k(num_winners = 2, num_tickets = 2, prob_win = (1/302575350))/prob_k(num_winners = 2, num_tickets = 2, prob_win = (1/302575350))
prob_k(num_winners = 2, num_tickets = 3, prob_win = (1/302575350))/prob_k(num_winners = 2, num_tickets = 2, prob_win = (1/302575350))
prob_k(num_winners = 2, num_tickets = 4, prob_win = (1/302575350))/prob_k(num_winners = 2, num_tickets = 2, prob_win = (1/302575350))

3.2. Function: EV, w/ split pot

inputs:

  • jackpot size (can be either cash value or annuity value)
  • number of tickets sold
  • max number of potential winners to consider
  • probability of winning jackpot
  • (also: method of computing probability, summing or inverse of non-winners)
  • (also: whether pot is to be split)

outputs:

  • expected value of a ticket (incorporating odds and payout if split ticket)
ev_split <- function(jackpot, tickets, max_winners, prob_win, method, split){
  
  # 
  if (method == "inverse" & split == TRUE){
    ev <- jackpot * (1 - prob_k(num_winners = 0, num_tickets = tickets, prob_win = prob_win))
  }
  
  if (method == "inverse" & split == FALSE){
    return("nah, this combo (inverse & no split) doesn't make sense")
  }
  
  if (method == "sum"){
    ev <- 0
    for (n in 1:max_winners){
      if (split == TRUE){
      ev <- ev + ((jackpot/n)*n * prob_k(num_winners = n, num_tickets = tickets, prob_win = prob_win))
      #ev <- ev + ((jackpot/n)*n * dbinom(size = n, x = tickets, prob = prob_win)) #(same as above)
      }
      if (split == FALSE){
      ev <- ev + ((jackpot/1)*n * prob_k(num_winners = n, num_tickets = tickets, prob_win = prob_win))
      }
    }
  }
  
  return(ev/tickets)
  
}

3.2. Test

Test cases for function (long endless scroll of output not displayed in Rmd).

3.3. Calculate

calculate odds of split pot, as function of number of tickets sold

Plan: * loop from 1 million to 1 billion ticket sales, by million * loop from 0 winners to 10 winners * calculate the odds of winning, for each combination * store results

# set values to loop through
loop_winners <- 0:10
loop_tickets <- 1:1000*1e6
total_loop <- length(loop_winners) * length(loop_tickets)

# create data frame, to store results
dat_split <- data.frame(num_winners = integer(length = total_loop),
                        num_tickets = integer(length = total_loop),
                        prob = double(length = total_loop))


# loop through all values, calculate odds, and store
index <- 0
for (num_winners_i in loop_winners){
  for (num_tickets_k in loop_tickets){
    index <- index + 1
    
    dat_split$num_winners[index] <- num_winners_i
    dat_split$num_tickets[index] <- num_tickets_k
    dat_split$prob[index] <- prob_k(num_winners = num_winners_i,
                                    num_tickets = num_tickets_k,
                                    prob_win = (1/302575350))
    
  }
}

3.4. View Results (Tabular)

dat_split_mod <-
dat_split %>% 
  arrange(num_tickets, num_winners) %>% 
  pivot_wider(names_from = num_winners,
              values_from = prob) %>% 
  rename(prob_0 = "0", prob_1 = "1", prob_2 = "2", prob_3 = "3", prob_4 = "4",
         prob_5 = "5", prob_6 = "6", prob_7 = "7", prob_8 = "8", prob_9 = "9", prob_10 = "10") %>% 
  mutate(prob_5_plus = 1 - (prob_0 + prob_1 + prob_2 + prob_3 + prob_4)) %>%
  #mutate(check_1 = prob_0 + prob_1 + prob_2 + prob_3 + prob_4 + prob_5,
  #       check_2 = prob_0 + prob_1 + prob_2 + prob_3 + prob_4,
  #       check_3 = prob_0 + prob_1 + prob_2 + prob_3 + prob_4 + prob_6)
  select(num_tickets, prob_0, prob_1, prob_2, prob_3, prob_4, prob_5_plus, everything()) %>% 
  pivot_longer(cols = c("prob_0", "prob_1", "prob_2", "prob_3", "prob_4", "prob_5_plus"),
               names_to = "num_winners",
               values_to = "prob") %>% 
  select(num_tickets, num_winners, prob) %>% 
  mutate(num_winners = case_when(num_winners == "prob_0" ~ "None",
                                 num_winners == "prob_1" ~ "One",
                                 num_winners == "prob_2" ~ "Two",
                                 num_winners == "prob_3" ~ "Three",
                                 num_winners == "prob_4" ~ "Four",
                                 num_winners == "prob_5_plus" ~ "Five or More"),
         num_winners = factor(num_winners,
                              levels = c("None", "One", "Two", "Three", "Four", "Five or More")))

dat_split_mod

3.5. Probability Sums

  • we can see that, the first ten winners, even at 1 billion tickets, account for more than 99.9% of winners
  • (i.e. we gain little from further incorporating odds of 11, 12, 13+ winners)
dat_split %>% 
  arrange(num_tickets, num_winners) %>% 
  group_by(num_tickets) %>% 
  summarize(prob_sum = sum(prob))
## `summarise()` ungrouping output (override with `.groups` argument)

3.6. Graph: Split Pot

Going to graph this with:

  • full range of number of winners considered (0, 1, 2, 3, 4, 5 or more)
  • zero winners
  • 1 winner

The latter one are for didactic/explanatory purposes.

dat_split_mod %>%
  ggplot(aes(x = num_tickets,
             y = prob,
             color = num_winners)) +
  geom_line(size = 1.25,
            alpha = 0.5) +
  scale_x_continuous(labels = seq(0, 1000, 100),
                     breaks = seq(0, 1000, 100)*1e6) +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  scale_color_manual(values = c("red", "green4", "blue4", "dodgerblue2", "skyblue", "turquoise1")) +
  labs(x = "Number of Tickets Sold \n (millions)",
       y = "Probability of This Many Winners",
       title = "Odds of a Split Pot v. Number of Tickets Sold",
       color = "Numbers of Winners") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 0,
                                   size = 10),
        axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
                                    size = 12),
        axis.title.y = element_text(margin = margin(t = 0, b = 0, r = 10, l = 0),
                                    size = 12),
        plot.title = element_text(hjust = 0.5,
                                  size = 15),
        legend.position = "right")

dat_split_mod %>%
  filter(num_winners == "None") %>% 
  ggplot(aes(x = num_tickets,
             y = prob,
             color = num_winners)) +
  geom_line(size = 1.25,
            alpha = 0.5) +
  scale_x_continuous(labels = seq(0, 1000, 100),
                     breaks = seq(0, 1000, 100)*1e6) +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  scale_color_manual(values = c("red", "green4", "blue4", "dodgerblue2", "skyblue", "turquoise1")) +
  labs(x = "Number of Tickets Sold \n (millions)",
       y = "Probability of This Many Winners",
       title = "",
       color = "Numbers of Winners") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 0,
                                   size = 10),
        axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
                                    size = 12),
        axis.title.y = element_text(margin = margin(t = 0, b = 0, r = 10, l = 0),
                                    size = 12),
        plot.title = element_text(hjust = 0.5,
                                  size = 15),
        legend.position = "right")

dat_split_mod %>%
  filter(num_winners == "None" | num_winners == "One") %>% 
  ggplot(aes(x = num_tickets,
             y = prob,
             color = num_winners)) +
  geom_line(size = 1.25,
            alpha = 0.5) +
  scale_x_continuous(labels = seq(0, 1000, 100),
                     breaks = seq(0, 1000, 100)*1e6) +
  scale_y_continuous(breaks = seq(0, 1, 0.1),
                     limits = c(0, 1)) +
  scale_color_manual(values = c("red", "green4")) +
  labs(x = "Number of Tickets Sold \n (millions)",
       y = "Probability of This Many Winners",
       title = "",
       color = "Numbers of Winners") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 0,
                                   size = 10),
        axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
                                    size = 12),
        axis.title.y = element_text(margin = margin(t = 0, b = 0, r = 10, l = 0),
                                    size = 12),
        plot.title = element_text(hjust = 0.5,
                                  size = 15),
        legend.position = "right")

3.6. Graph: Split Pot (w/ overlay)

plot_3.6 <-
dat_split_mod %>%
  ggplot(aes(x = num_tickets,
               y = prob,
               color = num_winners)) +
    geom_line(size = 1.25,
              alpha = 0.5) +
    scale_x_continuous(labels = seq(0, 1000, 100),
                       breaks = seq(0, 1000, 100)*1e6) +
    scale_y_continuous(breaks = seq(0, 1, 0.1)) +
    scale_color_manual(values = c("red", "green4", "blue4", "dodgerblue2", "skyblue", "turquoise1")) +
    labs(x = "Number of Tickets Sold \n (millions)",
         y = "Probability of This Number of Winners",
         title = "Odds of a Split Pot v. Number of Tickets Sold",
         color = "Numbers of Winners") +
  annotate("rect",
         xmin = min(lott_clean$sales),
         xmax = quantile(lott_clean$sales, prob = 0.50),
         ymin = 0,
         ymax = 1,
         fill = "blue",
         alpha = 0.2) +
  annotate("rect",
         xmin = min(lott_clean$sales),
         xmax = quantile(lott_clean$sales, prob = 0.90),
         ymin = 0,
         ymax = 1,
         fill = "blue",
         alpha = 0.2) + 
  annotate("rect",
         xmin = min(lott_clean$sales),
         xmax = quantile(lott_clean$sales, prob = 0.99),
         ymin = 0,
         ymax = 1,
         fill = "blue",
         alpha = 0.2) + 
  annotate("text",
           x = quantile(lott_clean$sales, prob = 0.50),
           y = 0.5,
           label = "50th percentile",
           angle = 90,
           size = 4) +
  annotate("text",
         x = quantile(lott_clean$sales, prob = 0.90),
         y = 0.5,
         label = "90th percentile",
         angle = 90,
         size = 4) +
  annotate("text",
       x = quantile(lott_clean$sales, prob = 0.99),
       y = 0.5,
       label = "99th percentile",
       angle = 90,
       size = 4) +
  geom_vline(xintercept = max(lott_clean$sales)) +
  annotate("text",
           x = max(lott_clean$sales) + 5e6,
           y = 0.5,
           label = "maximum sales ever",
           angle = 90,
           size = 4) +
  geom_vline(xintercept = min(lott_clean$sales)) +
  annotate("text",
           x = min(lott_clean$sales) - 10e6,
           y = 0.5,
           label = "minimum sales ever",
           angle = 90,
           size = 4) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 0,
                                     size = 10),
          axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
                                      size = 12),
          axis.title.y = element_text(margin = margin(t = 0, b = 0, r = 10, l = 0),
                                      size = 12),
          plot.title = element_text(hjust = 0.5,
                                    size = 15),
          legend.position = "right")
  
plot_3.6

3.6. Save

if (plot_save){
ggsave(plot = plot_3.6,
       filename = "split_pot_overlay.pdf",
       width = 15,
       height = 7)
}

4. Cash Value

The goal here is to figure out the cash value of various jackpots, from their advertised (i.e. annuity) amounts. Can’t find a super clear answer (at least to me on this). Below are the more details written state rules/laws for the lottery.

But honestly, I don’t really understand these, so will just look at historical records of jackpot size, and compare advertised (annuity) amount of the jackpot, to the stated equivalent cash prize.

Used, wayback machine for this:

(Seems like the date of the screenshots are kind of out of wack with the date of the drawing. But when you click on the link, it gives you jackpot, cash, and date of drawing, so those are all correctly linked. And that is what I am recording, date of drawing, not date of screenshot.)

Method: clicked through at least one screenshot on each month, in the years 2021 to 2017 (only up to October 31, when rules changed). Not every month, while having a screen shot had a unique drawing value. So just, noted where there was a unique drawing value. And noted that. Not perfectly rigorously. Fully rigorous method could do a full scrape from here.

4.1. Enter Data

cash <-
rbind(
c(25e6, 18.6e6, "2021-01-29"),
c(20e6, 14.7e6, "2021-01-26"),
c(850e6, 628.2e6, "2021-01-19"),
c(510e6, 377.4e6, "2021-01-08"),
c(401e6, 305.4e6, "2021-01-01"),
c(310e6, 238e6, "2020-12-18"),
c(57e6, 45.4e6, "2020-08-25"),
c(22e6, 18e6, "2020-08-07"),
c(101e6, 76.5e6, "2020-03-24"),
c(65e6, 48.6e6, "2020-03-03"),
c(202e6, 142e6, "2020-02-11"),
c(266e6, 182.3e6, "2019-12-03"),
c(418e6, 263.3e6, "2019-05-28"),
c(316e6, 195.5e6, "2019-05-14"),
c(305e6, 181e6, "2018-12-18"),
c(208e6, 119e6, "2018-12-04"),
c(122e6, 69e6, "2018-11-16"),
c(106e6, 59e6, "2018-11-13"),
c(90e6, 50e6, "2018-11-09"),
c(52e6, 29e6, "2018-11-02"),
c(40e6, 22e6, "2018-10-26"),
c(45e6, 25e6, "2018-10-30"),
c(1600e6, 913e6, "2018-10-23"),
c(1000e6, 565e6, "2018-10-19"),
c(470e6, 265e6, "2018-10-09"),
c(405e6, 235e6, "2018-10-05"),
c(367e6, 213e6, "2018-10-02"),
c(252e6, 148e6, "2018-09-18"),
c(187e6, 111e6, "2018-09-07"),
c(88e6, 52e6, "2018-08-17"),
c(50e6, 29e6, "2018-08-03"),
c(306e6, 185e6, "2018-07-10"),
c(144e6, 85e6, "2018-06-12"),
c(84e6, 49e6, "2018-05-29"),
c(143e6, 84e6, "2018-05-04"),
c(80e6, 47e6, "2018-04-20"),
c(40e6, 24e6, "2018-04-03"),
c(502e6, 301e6, "2018-03-30"),
c(318e6, 187e6, "2018-03-13"),
c(290e6, 172e6, "2018-03-09"),
c(243e6, 143e6, "2018-03-02"),
c(204e6, 120e6, "2018-02-23"),
c(185e6, 109e6, "2018-02-20"),
c(136e6, 81e6, "2018-02-09"),
c(63e6, 38e6, "2018-01-23"),
c(50e6, 30e6, "2018-01-16"),
c(40e6, 25e6, "2018-01-09"),
c(361e6, 225e6, "2018-01-02"),
c(306e6, 191e6, "2017-12-29"),
c(277e6, 172e6, "2017-12-26"),
c(247e6, 155e6, "2017-12-22"),
c(208e6, 130e6, "2017-12-15"),
c(160e6, 100e6, "2017-12-05"),
c(145e6, 91e6, "2017-12-01"),
c(106e6, 66e6, "2017-11-21"),
c(82e6, 51e6, "2017-11-14"),
c(71e6, 44e6, "2017-11-10"),
c(59e6, 36e6, "2017-11-07"),
c(48e6, 29e6, "2017-11-03")
)

4.2. Reshape Data

# name columns
colnames(cash) <- c("advertised", "cash", "date")

# reshape data
cash <-
data.frame(cash) %>% 
  mutate(advertised = as.integer(advertised),
         cash = as.integer(cash),
         date = as.Date(date),
         perc = cash/advertised*100)

# view data
cash

4.3. Graph: Annuity & Percentage

coeff <- max(cash$advertised)/max(cash$perc)
color_advertised <- "dodgerblue4"
color_perc <- "black"

plot_4.3 <-
cash %>% 
ggplot(aes(x = date)) +
  geom_line(aes(y = advertised),
            color = color_advertised,
            size = 2,
            alpha = 0.5) +
  geom_point(aes(y = perc * coeff),
             color = color_perc) +
  geom_line(aes(y = perc * coeff),
            color = color_perc,
            size = 1,
            alpha = 0.5) +
  scale_x_date(name = "Date",
               date_labels = "%Y (%b)",
               date_breaks = "1 month") +
  scale_y_continuous(labels = scales::label_dollar(),
                     breaks = c(0, 1:17*100e6),
                     name = "Advertised Jackpot Value (Annuity)",
                     sec.axis = sec_axis(trans = ~.*(1/coeff),
                                       name = "Cash Value (as % of Advertised Jackpot)",
                                       breaks = seq(0, 100, 10))) +
  labs(title = "Advertised Jackpot Size & Cash Value of Jackpot, Over Time") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.y.left = element_text(color = color_advertised, size = 10),
        axis.title.y.left = element_text(color = color_advertised),
        axis.title.y.right = element_text(margin = margin(t = 0, r = 0, b = 0, l = 10)),
        axis.text.x = element_text(angle = 90),
        panel.grid.minor = element_blank())

plot_4.3

## 4.3. Save

if (plot_save){
ggsave(plot = plot_4.3,
       filename = "cash_jackpot.pdf",
       width = 15,
       height = 7)
}

4.4. Graph: Annuity, Cash, & Percentage

coeff <- max(cash$advertised)/max(cash$perc)
color_advertised <- "black"
color_perc <- "steelblue3" #"dodgerblue4"

plot_4.4 <-
cash %>% 
  pivot_longer(cols = c("advertised", "cash"),
               names_to = "claim_method",
               values_to = "value") %>% 
  mutate(claim_method = case_when(claim_method == "advertised" ~ "Advertised/Annuity Value",
                                  claim_method == "cash" ~ "Cash Value")) %>% 
ggplot(aes(x = date)) +
  geom_line(aes(y = value,
                color = claim_method),
            size = 2,
            alpha = 0.5) +
  geom_point(aes(y = perc * coeff),
             color = color_perc) +
  geom_line(aes(y = perc * coeff),
            color = color_perc,
            size = 1,
            alpha = 0.5) +
  scale_x_date(name = "Date",
               date_labels = "%Y",
               date_breaks = "1 year") +
  scale_y_continuous(labels = c("$0",
                                paste0("$", seq(1, 9, 1)*100, "m"),
                                paste0("$", seq(1, 1.6, 0.1), "b")),
                     breaks = c(0, 1:16*100e6),
                     name = "Jackpot Value",
                     sec.axis = sec_axis(trans = ~.*(1/coeff),
                                       name = "Cash Value (as % of Advertised/Annuity Value)",
                                       breaks = seq(0, 100, 10))) +
  scale_color_manual(values = c("green4", "darkorchid3")) +
  labs(title = "Jackpot Size, Over Time",
       color = "") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 15),
        axis.text.y.left = element_text(color = color_advertised,
                                        size = 10),
        axis.text.y.right = element_text(color = color_perc,
                                        size = 10),
        axis.title.y.left = element_text(color = color_advertised,
                                         margin = margin(t = 0, r = 10, b = 0, l = 0),
                                         size = 15),
        axis.title.y.right = element_text(margin = margin(t = 0, r = 0, b = 0, l = 10),
                                          size = 15,
                                          color = color_perc),
        axis.ticks.y.right = element_line(color = color_perc),
        axis.text.x = element_text(angle = 0),
        panel.grid.minor = element_blank(),
        legend.position = "top")

plot_4.4

## 4.4. Save

if (plot_save){
ggsave(plot = plot_4.4,
       filename = "cash_jackpot_bothlines.pdf",
       width = 15,
       height = 7)
}

4.5. Percentiles

quantile(x = cash$perc,
         probs = c(0, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 1))
##       0%       1%       5%      10%      25%      50%      75%      90% 
## 55.00000 55.32222 55.64990 56.47660 58.73571 60.00000 62.62652 74.47385 
##      95%      99%     100% 
## 76.22106 80.56013 81.81818

4.6. Median, Each Year

cash %>% 
  mutate(year = year(date)) %>% 
  group_by(year) %>% 
  summarise(median = median(perc))
## `summarise()` ungrouping output (override with `.groups` argument)

4.7. SET: Cash Value %

This is where I will set coefficient (proportion) to multiply advertised (annuity) jackpot values by. Since I don’t have the same number of observations for every year, I will just take the mean of the median cash percent for each year.

cash_perc <-
  cash %>% 
    mutate(year = year(date)) %>% 
    group_by(year) %>% 
    summarize(median = median(perc)) %>% 
    summarize(mean_median = mean(median))
## `summarise()` ungrouping output (override with `.groups` argument)
cash_perc
cash_coeff <- cash_perc$mean_median/100
cash_coeff
## [1] 0.668581

5. Taxes

5.1. Function: federal taxes

Note: I used kind of recursive functions to calculate taxes here. This is not the most computationally efficient thing to do here. (For any income, you should only need to do 2 operations: add fixed sum to pay from brackets below and proportion for top bracket. But that at least requires a one time cost of calculating the fixed values at every bracket. This recursive function (1) is more fun to think about, and (2) avoid having to calculate and manually type in the fixed sum for any given bracket. I just typed in the income brackets.)

# recursive function to calculate taxes
fed_taxes <- function(income){

  if (income <= 0) {
    return(0)
  }
  if (income > 518400){
    return(0.37 * (income - 518400) + fed_taxes(518400))
  }
  if (income <= 518400 & income > 207350){
    return(0.35 * (income - 207350) + fed_taxes(207350))
  }
  if (income <= 207350 & income > 163300){
    return(0.32 * (income - 163300) + fed_taxes(163300))
  }
  if (income <= 163300 & income > 85525){
    return(0.24 * (income - 85525) + fed_taxes(85525)) 
  }
  if (income <= 85525 & income > 40125){
    return(0.22 * (income - 40125) + fed_taxes(40125))
  }
  if (income <= 40125 & income > 9875){
    return(0.12 * (income - 9875) + fed_taxes(9875))
  }
  if (income <= 9875 & income > 0){
    return(0.10 * income)
  }
}

5.1. Test

Test cases for function (long endless scroll of output not displayed in Rmd).

fed_taxes(-100)
fed_taxes(0)
fed_taxes(1)
fed_taxes(9874)
fed_taxes(9875)
fed_taxes(9876)
fed_taxes(40124)
fed_taxes(40125)
fed_taxes(40126)
fed_taxes(85524)
fed_taxes(85525)
fed_taxes(85526)
fed_taxes(163299)
fed_taxes(163300)
fed_taxes(163301)
fed_taxes(207349)
fed_taxes(207350)
fed_taxes(207351)
fed_taxes(518399)
fed_taxes(518400)
fed_taxes(518401)
fed_taxes(1e6)
fed_taxes(10e6)
fed_taxes(100e6)
fed_taxes(1000e6)

fed_taxes(50e3)
fed_taxes(100e3)

5.2. Function: state taxes

Initially was going to calculate taxes for California residents. BUt apparently California residents are exempt from paying state income tax on lottery winnings. Sources:

Going to implement state income tax rates from New York:

California Tax Brackets (for reference):

# recursive function to calculate taxes
state_taxes <- function(income){
  
  state <- "NY" # create internal state variable (not passed to function bc then have to rewrite a bunch of other code below)
  
  if (state == "NY"){
      if (income <= 0) {
        return(0)
      }
      if (income > 1077550){
        return((0.0882) * (income - 1077550) + state_taxes(1077550))
      }
      if (income <= 1077550 & income > 215400){
        return(0.0685 * (income - 215400) + state_taxes(215400))
      }
      if (income <= 215400 & income > 80650){
        return(0.0641 * (income - 80650) + state_taxes(80650))
      }
      if (income <= 80650 & income > 21400){
        return(0.0609 * (income - 21400) + state_taxes(21400))
      }
      if (income <= 21400 & income > 13900){
        return(0.059 * (income - 13900) + state_taxes(13900))
      }
      if (income <= 13900 & income > 11700){
        return(0.0525 * (income - 11700) + state_taxes(11700))
      }
      if (income <= 11700 & income > 8500){
        return(0.045 * (income - 8500) + state_taxes(8500))
      }
      if (income <= 8500 & income > 0){
        return(0.04 * income)
      }
  }
  
  # just leaving this here, in case I ever want to use it
  
  if (state == "CA"){
      if (income <= 0) {
        return(0)
      }
      if (income > 1e6){
        return((0.123+0.01) * (income - 1e6) + state_taxes(1e6))
      }
      if (income <= 1e6 & income > 599012){
        return(0.123 * (income - 599012) + state_taxes(599012))
      }
      if (income <= 599012 & income > 359407){
        return(0.113 * (income - 359407) + state_taxes(359407))
      }
      if (income <= 359407 & income > 299508){
        return(0.103 * (income - 299508) + state_taxes(299508))
      }
      if (income <= 299508 & income > 58634){
        return(0.093 * (income - 58634) + state_taxes(58634))
      }
      if (income <= 58634 & income > 46394){
        return(0.08 * (income - 46394) + state_taxes(46394))
      }
      if (income <= 46394 & income > 33421){
        return(0.06 * (income - 33421) + state_taxes(33421))
      }
      if (income <= 33421 & income > 21175){
        return(0.04 * (income - 21175) + state_taxes(21175))
      }
      if (income <= 21175 & income > 8932){
        return(0.02 * (income - 8932) + state_taxes(8932))
      }
      if (income <= 8932 & income > 0){
        return(0.01 * income)
      }
  }

}

5.2. Test

Test cases for function (long endless scroll of output not displayed in Rmd).

state_taxes(-10)
state_taxes(0)
state_taxes(8499)
state_taxes(8500)
state_taxes(8501)
state_taxes(11699)
state_taxes(11700)
state_taxes(11701)
state_taxes(13899)
state_taxes(13900)
state_taxes(13901)
state_taxes(21399)
state_taxes(21400)
state_taxes(21401)
state_taxes(80649)
state_taxes(80650)
state_taxes(80651)
state_taxes(215399)
state_taxes(215400)
state_taxes(215401)
state_taxes(1077549)
state_taxes(1077550)
state_taxes(1077551)
state_taxes(2e6)
state_taxes(10e6)
state_taxes(100e6)
state_taxes(1000e6)

state_taxes(87701-4601)

5.3. Average Reduction (taxes & cash)

Here I will look at some basic stats about how much the jackpot is reduced by taxes and by the cash discount.

jackpot_tax_check <-
lott_clean %>% 
  mutate(jackpot_taxed = jackpot - fed_taxes(jackpot) - state_taxes(jackpot),
         jackpot_taxed_prop = jackpot_taxed / jackpot,
         jackpot_both = (jackpot*cash_coeff) - fed_taxes(jackpot*cash_coeff) - state_taxes(jackpot*cash_coeff),
         jackpot_both_prop = jackpot_both / jackpot,
         jackpot_wrong_way = jackpot_taxed*cash_coeff,
         jackpot_wrong_way_prop = jackpot_wrong_way/jackpot)
## Warning: Problem with `mutate()` input `jackpot_taxed`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_taxed` is `jackpot - fed_taxes(jackpot) - state_taxes(jackpot)`.
## Warning in if (income <= 0) {: the condition has length > 1 and only the first
## element will be used
## Warning: Problem with `mutate()` input `jackpot_taxed`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_taxed` is `jackpot - fed_taxes(jackpot) - state_taxes(jackpot)`.
## Warning in if (income > 518400) {: the condition has length > 1 and only the
## first element will be used
## Warning: Problem with `mutate()` input `jackpot_taxed`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_taxed` is `jackpot - fed_taxes(jackpot) - state_taxes(jackpot)`.
## Warning in if (income <= 0) {: the condition has length > 1 and only the first
## element will be used
## Warning: Problem with `mutate()` input `jackpot_taxed`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_taxed` is `jackpot - fed_taxes(jackpot) - state_taxes(jackpot)`.
## Warning in if (income > 1077550) {: the condition has length > 1 and only the
## first element will be used
## Warning: Problem with `mutate()` input `jackpot_both`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_both` is `-...`.
## Warning in if (income <= 0) {: the condition has length > 1 and only the first
## element will be used
## Warning: Problem with `mutate()` input `jackpot_both`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_both` is `-...`.
## Warning in if (income > 518400) {: the condition has length > 1 and only the
## first element will be used
## Warning: Problem with `mutate()` input `jackpot_both`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_both` is `-...`.
## Warning in if (income <= 0) {: the condition has length > 1 and only the first
## element will be used
## Warning: Problem with `mutate()` input `jackpot_both`.
## i the condition has length > 1 and only the first element will be used
## i Input `jackpot_both` is `-...`.
## Warning in if (income > 1077550) {: the condition has length > 1 and only the
## first element will be used
jackpot_tax_check
# just taxes
quantile(x = jackpot_tax_check$jackpot_taxed_prop,
         probs = c(0, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 1))
##        0%        1%        5%       10%       25%       50%       75%       90% 
## 0.5418379 0.5418894 0.5419365 0.5419646 0.5420561 0.5422643 0.5427035 0.5430948 
##       95%       99%      100% 
## 0.5432567 0.5446207 0.5447134
# just taxes
mean(jackpot_tax_check$jackpot_taxed_prop)
## [1] 0.5424479
# cash and then taxes
quantile(x = jackpot_tax_check$jackpot_both_prop,
         probs = c(0, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 1))
##        0%        1%        5%       10%       25%       50%       75%       90% 
## 0.3622751 0.3623266 0.3623736 0.3624018 0.3624933 0.3627015 0.3631407 0.3635320 
##       95%       99%      100% 
## 0.3636939 0.3650579 0.3651506
# cash and then taxes
mean(jackpot_tax_check$jackpot_both_prop)
## [1] 0.362885

6. Smaller Prizes

The goal here is to calculate the expected value of the others prizes in the lottery

Odds for smaller prizes can be found here:

# total possible ticket combinations
tick_combos <- choose(n = 70, k = 5) * 25

# odds of winning each of the smaller prizes
odds_5.0 <- choose(n = 5, k = 5) * choose(n = 24, k = 1) / tick_combos                          # ~(1/12607306)
odds_4.1 <- choose(n = 5, k = 4) * choose(n = 65, k = 1) * choose(n = 1, k = 1)  / tick_combos  # ~(1/931001)
odds_4.0 <- choose(n = 5, k = 4) * choose(n = 65, k = 1) * choose(n = 24, k = 1) / tick_combos  # ~(1/38792)
odds_3.1 <- choose(n = 5, k = 3) * choose(n = 65, k = 2) * choose(n = 1, k = 1)  / tick_combos  # ~(1/14547)
odds_3.0 <- choose(n = 5, k = 3) * choose(n = 65, k = 2) * choose(n = 24, k = 1) / tick_combos  # ~(1/606)
odds_2.1 <- choose(n = 5, k = 2) * choose(n = 65, k = 3) * choose(n = 1, k = 1)  / tick_combos  # ~(1/693)
odds_1.1 <- choose(n = 5, k = 1) * choose(n = 65, k = 4) * choose(n = 1, k = 1)  / tick_combos  # ~(1/89)
odds_0.1 <- choose(n = 5, k = 0) * choose(n = 65, k = 5) * choose(n = 1, k = 1)  / tick_combos  # ~(1/37)

# calculate additional expected value of ticket, from small prizes
# method 1. without taxes
other_prizes <- 1e6  * odds_5.0 +
                10e3 * odds_4.1 +
                500  * odds_4.0 +
                200  * odds_3.1 +
                10   * odds_3.0 +
                10   * odds_2.1 +
                4    * odds_1.1 +
                2    * odds_0.1

other_prizes
## [1] 0.2469817
# method 2. with taxes
other_prizes_taxes <- (1e6  - fed_taxes(1e6)  - state_taxes(1e6))  * odds_5.0 +
                      (10e3 - fed_taxes(10e3) - state_taxes(10e3)) * odds_4.1 +
                      (500  - fed_taxes(500)  - state_taxes(500))  * odds_4.0 +
                      (200  - fed_taxes(200)  - state_taxes(200))  * odds_3.1 +
                      (10   - fed_taxes(10)   - state_taxes(10))   * odds_3.0 +
                      (10   - fed_taxes(10)   - state_taxes(10))   * odds_2.1 +
                      (4   - fed_taxes(4)   - state_taxes(4))      * odds_1.1 +
                      (2   - fed_taxes(2)   - state_taxes(2))      * odds_0.1

other_prizes_taxes
## [1] 0.1916548

7. Expected Value

The expected value is a function of:

  • net present value (cash value of prize)
  • taxes (federal & state)
  • number of tickets sold

Here, I try to identify how each of these affect the expected value of a ticket. And then graph those effects

7.1. Theoretical EVs

The goal here is to compute and then graph expected value of a ticket (EV) as function of:

  • jackpot value
  • number of tickets sold
  • whether the pot will be split, in the case of multiple winners

Must also select max # winners to cycle through (picked: 100).

Note: will compute

    1. ev: expected value, just including jackpot
    1. ev_full: expected value, including smaller jackpots as well
    1. ev_basic: expected value, if you buy tickets systematically (not randomly pick) [not fully computed out]
# preset value, to loop through & use
jackpots <- c(1, 1e3, 1e6, 10e6, 20e6, 40e6, 50e6, 75e6, 100e6, 125e6, 150e6, 175e6, 200e6, 300e6, 400e6, 500e6, 600e6,
              700e6, 800e6, 900e6, 1e9, 1.1e9, 1.2e9, 1.3e9, 1.4e9, 1.5e9, 1.6e9, 1.7e9, 1.8e9, 1.9e9, 2e9, 5e9, 10e9)
tickets <- c(1, 10, 1e3, 10e3, 100e3, 1e6, 10e6, 15e6, 20e6, 25e6, 30e6, 35e6, 40e6, 50e6, 100e6, 200e6, 500e6, 750e6, 1e9, 2e9, 5e9, 10e9)
max_winners <- 100
splits <- c(TRUE, FALSE)
total_EVs <- length(jackpots) * length(tickets) * length(splits)
total_EVs
## [1] 1452
# data frame, to save results
EV <- data.frame(jackpot = integer(total_EVs),
                 tickets = integer(total_EVs),
                 split = logical(total_EVs),
                 ev_jj = double(total_EVs),
                 ev_full = double(total_EVs),
                 ev_full_cash = double(total_EVs),
                 ev_full_taxes = double(total_EVs))

# loop and calculate all possible EVs
i <- 0
for (jackpot_i in jackpots){
  for (tickets_i in tickets){
    for (split_i in splits){
      
      i <- i + 1
      EV$jackpot[i] <- jackpot_i
      EV$tickets[i] <- tickets_i
      EV$split[i] <- split_i
      EV$ev_jj[i] <- ev_split(jackpot = jackpot_i,
                             tickets = tickets_i,
                             max_winners = max_winners,
                             prob_win = (1/302575350),
                             method = "sum",
                             split = split_i)
      
      EV$ev_full[i] <- EV$ev_jj[i] + other_prizes
      
      EV$ev_full_cash[i] <- ev_split(jackpot = jackpot_i * cash_coeff,
                                tickets = tickets_i,
                                max_winners = max_winners,
                                prob_win = (1/302575350),
                                method = "sum",
                                split = split_i) + other_prizes
      
      EV$ev_full_taxes[i] <- ev_split(jackpot = (jackpot_i * cash_coeff) - 
                                                 fed_taxes(jackpot_i * cash_coeff) -
                                                 state_taxes(jackpot_i * cash_coeff),
                                tickets = tickets_i,
                                max_winners = max_winners,
                                prob_win = (1/302575350),
                                method = "sum",
                                split = split_i) + other_prizes_taxes
    }
  }
}
EV

7.2. Graph: Theoretical EVs (panel)

EV_long <-
EV %>% 
  pivot_longer(cols = c("ev_full", "ev_full_cash", "ev_full_taxes"),
               names_to = "type",
               values_to = "ev") %>% 
  mutate(tickets = factor(tickets,
                          levels = c(1, 10, 1e3, 10e3, 100e3, 1e6, 10e6, 15e6, 20e6, 25e6, 30e6, 35e6, 40e6, 50e6, 100e6, 
                                      200e6, 500e6, 750e6, 1e9, 2e9, 5e9, 10e9),
                          labels = c("1", "10", "1k", "10k", "100k", "1m", "10m", "15m", "20m", "25m", "30m", "35m", "40m", "50m", "100m",
                                     "200m", "500m", "750m", "1b", "2b", "5b", "10b")),
         jackpot = factor(jackpot,
                          levels = c(1, 1e3, 1e6, 10e6, 20e6, 40e6, 50e6, 75e6, 100e6, 125e6, 150e6, 175e6, 200e6, 300e6,
                                     400e6, 500e6, 600e6, 700e6, 800e6, 900e6, 1e9, 1.1e9, 1.2e9, 1.3e9, 1.4e9, 1.5e9, 
                                     1.6e9, 1.7e9, 1.8e9, 1.9e9, 2e9, 5e9, 10e9),
                          labels = c("$1", "$1k", "$1m", "$10m", "$20m", "$40m", "$50m", "$75m", "$100m", "$125m", "$150m",
                                     "$175m", "$200m", "$300m", "$400m", "$500m", "$600m", "$700m", "$800m", "$900m",
                                     "$1b", "$1.1b", "$1.2b", "$1.3b", "$1.4b", "$1.5b", "$1.6b", "$1.7b", "$1.8b", "$1.9b",
                                      "$2b", "$5b", "$10b")),
         split = factor(split,
                        levels = c(FALSE, TRUE),
                        labels = c("Jackpots Never Split :)", "Jackpots Split :(")),
         type = factor(type,
                       levels = c("ev_full", "ev_full_cash", "ev_full_taxes"),
                       labels = c("Annuity Value (No Taxes)", "Cash Value (No Taxes)", "Cash Value (Fed & State Taxes)")),
         ev_print = round(ev, 2),
         sign = case_when(ev_print <2   ~ "negative",
                          ev_print == 2 ~ "zero",
                          ev_print >2   ~ "positive"),
         sign = factor(sign,
                       levels = c("negative", "zero", "positive"))) %>% 
  group_by(sign) %>% 
  mutate(ev_max_sign = max(ev_print)^(1/4), # note: raising to root, done to squish together values
         ev_min_sign = min(ev_print)^(1/3)) %>% 
  ungroup() %>% 
  mutate(alpha = case_when(sign == "positive" ~ 0.4 + 0.6 * ((ev_print^(1/4) - ev_min_sign) / ev_max_sign), # add some baseline alpha
                           sign == "negative" ~ 0.4 + 0.6 * (1 - ((ev_print^(1/3) - ev_min_sign) / ev_min_sign)),
                           sign == "zero" ~ 1))
plot_7.2 <-
EV_long %>% 
  ggplot(aes(y = tickets,
             x = jackpot,
             fill = sign,
             label = ev_print)) +
  geom_raster(alpha = EV_long$alpha) +
  geom_text(color = "white",
            size = 2) +
  geom_rect(aes(xmin = 5  - 0.5,  #which(jackpots %in% 20e6), #min(lott_clean$jackpot)
                xmax = 26 + 0.5, #which(jackpots %in% 1.5e9), #max(lott_clean$jackpot)
                ymin = 8  - 0.5,  #which(tickets %in% 15e6), #min(lott_clean$sales)
                ymax = 18 + 0.5, #which(tickets %in% 750e6), #max(lott_clean$sales)
                fill = NA),
                color = "white") +
  scale_fill_manual(values = c("red2", "green4", "grey")) +
  labs(x = "Ticket Sales",
       y = "Advertised Jackpot Size ($)",
       title = "Expected Value of A Lottery Ticket") +
  guides(fill = FALSE) +
  facet_grid(split ~ type) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90,
                                   vjust = 0.25),
        axis.text.y = element_text(vjust = 0.25),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

plot_7.2

7.2. Save

if (plot_save){
ggsave(plot = plot_7.2,
       filename = "ev_theoretical_wide.pdf",
       width = 20,
       height = 10)
}

7.2. Theoretical EV (single)

EV_long_filtered <-
  EV_long %>% 
    filter(split == "Jackpots Split :(" & type == "Cash Value (Fed & State Taxes)") 

EV_long_filtered %>% 
  ggplot(aes(y = tickets,
             x = jackpot,
             fill = sign,
             label = format(ev_print, digits = 2))) +
  geom_raster(alpha = EV_long_filtered$alpha) +
  geom_text(color = "white",
            size = 3.5) +
  geom_rect(aes(xmin = 5  - 0.5,  #which(jackpots %in% 20e6), #min(lott_clean$jackpot)
                xmax = 26 + 0.5, #which(jackpots %in% 1.5e9), #max(lott_clean$jackpot)
                ymin = 8  - 0.5,  #which(tickets %in% 15e6), #min(lott_clean$sales)
                ymax = 18 + 0.5, #which(tickets %in% 750e6), #max(lott_clean$sales)
                fill = NA),
                color = "white") +
  scale_fill_manual(values = c("red2", "green4", "grey")) +
  labs(y = "Ticket Sales",
       x = "Advertised Jackpot Size",
       title = paste0("Expected Value of Lottery Ticket (in $), Given:",
                      "\n (a) Jackpot Size, ",
                      " (b) Number of Tickets Sold",
                      "\n (accounting for taxes & cash discount)")) +
  guides(fill = FALSE) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90,
                                   vjust = 0.25,
                                   size = 12),
        axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
                                    size = 14),
        axis.text.y = element_text(vjust = 0.25,
                                   size = 11),
        axis.title.y = element_text(size = 14),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

EV_long_filtered <-
  EV_long %>% 
    filter(split == "Jackpots Split :(" & type == "Cash Value (Fed & State Taxes)") 

EV_long_filtered %>% 
  ggplot(aes(y = tickets,
             x = jackpot,
             fill = sign,
             label = format(ev_print, digits = 2))) +
  geom_raster(alpha = EV_long_filtered$alpha) +
  geom_text(color = "white",
            size = 3.5) +
  # cross-hairs
  geom_segment(aes(x = 0, xend = 20.5, y = 11, yend = 11), linetype = "dotted", size = 1) +
  geom_segment(aes(x = 21.5, xend = length(jackpots),
                   y = 11, yend = 11), linetype = "dotted", size = 1) +
  geom_segment(aes(x = 21, xend = 21, y = 0, yend = 10.5), linetype = "dotted", size = 1) +
  geom_segment(aes(x = 21, xend = 21, y = 11.5, yend = 22), linetype = "dotted", size = 1) +
  geom_rect(aes(xmin = 20.5,  
                xmax = 21.5,
                ymin = 10.5, 
                ymax = 11.5,
                fill = NA),
                color = "black",
            linetype = "dotted",
            size = 1) +
  geom_rect(aes(xmin = 5  - 0.5,  #which(jackpots %in% 20e6), #min(lott_clean$jackpot)
                xmax = 26 + 0.5, #which(jackpots %in% 1.5e9), #max(lott_clean$jackpot)
                ymin = 8  - 0.5,  #which(tickets %in% 15e6), #min(lott_clean$sales)
                ymax = 18 + 0.5, #which(tickets %in% 750e6), #max(lott_clean$sales)
                fill = NA),
                color = NA) + #HACKY SOLUTION: just make it invisible (colors get messed up for some reason otherwise)
  scale_fill_manual(values = c("red2", "green4", "grey")) +
  labs(y = "Ticket Sales",
       x = "Advertised Jackpot Size",
       title = paste0("Expected Value of Lottery Ticket (in $), Given:",
                      "\n (a) Jackpot Size, ",
                      " (b) Number of Tickets Sold",
                      "\n (after accounting for taxes & cash discount)")) +
  guides(fill = FALSE) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90,
                                   vjust = 0.25,
                                   size = 12),
        axis.title.x = element_text(margin = margin(t = 10, b = 0, r = 0, l = 0),
                                    size = 14),
        axis.text.y = element_text(vjust = 0.25,
                                   size = 11),
        axis.title.y = element_text(size = 14),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

7.3. EV over time

# shape data
EV_time <-
lott_clean %>% 
  rowwise() %>% 
  mutate(ev_advert_ns = ev_split(jackpot = jackpot,
                                 tickets = sales,
                                 max_winners = 100,
                                 prob_win = (1/302575350),
                                 method = "sum",
                                 split = FALSE) + other_prizes,
         ev_cash_ns = ev_split(jackpot = jackpot * cash_coeff,
                               tickets = sales,
                               max_winners = 100,
                               prob_win = (1/302575350),
                               method = "sum",
                               split = FALSE) + other_prizes,
         ev_taxes_ns = ev_split(jackpot = ((jackpot * cash_coeff) - fed_taxes(jackpot * cash_coeff) - state_taxes(jackpot * cash_coeff)),
                               tickets = sales,
                               max_winners = 100,
                               prob_win = (1/302575350),
                               method = "sum",
                               split = FALSE) + other_prizes_taxes,
         ev_advert_s = ev_split(jackpot = jackpot,
                               tickets = sales,
                               max_winners = 100,
                               prob_win = (1/302575350),
                               method = "sum",
                               split = TRUE) + other_prizes,
         ev_cash_s = ev_split(jackpot = jackpot * cash_coeff,
                              tickets = sales,
                              max_winners = 100,
                              prob_win = (1/302575350),
                              method = "sum",
                              split = TRUE) + other_prizes,
         ev_taxes_s = ev_split(jackpot = ((jackpot * cash_coeff) - fed_taxes(jackpot * cash_coeff) - state_taxes(jackpot * cash_coeff)),
                              tickets = sales,
                              max_winners = 100,
                              prob_win = (1/302575350),
                              method = "sum",
                              split = TRUE) + other_prizes) %>% 
  pivot_longer(cols = contains("ev_"),
               names_to = "ev_type",
               values_to = "ev") %>% 
  mutate(split = case_when(ev_type %in% c("ev_advert_ns", "ev_cash_ns", "ev_taxes_ns") ~ FALSE,
                           ev_type %in% c("ev_advert_s", "ev_cash_s", "ev_taxes_s") ~ TRUE),
         ev_calc = case_when(ev_type %in% c("ev_advert_ns", "ev_advert_s") ~ "advertised",
                             ev_type %in% c("ev_cash_ns", "ev_cash_s") ~ "cash",
                             ev_type %in% c("ev_taxes_ns", "ev_taxes_s") ~ "taxes"))

EV_time

7.3. Graph: EV over Time

plot_7.3 <-
EV_time %>% 
ggplot(aes(x = as.Date(date),
           y = ev,
           color = ev_calc)) +
  geom_line() +
  geom_hline(yintercept = 2) +
  scale_x_date(name = "Date",
               date_labels = "%Y") +
  scale_y_continuous(name = "Expected Value") +
  labs(title = "Expected Value, Over Time") +
  facet_grid(.~split) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

plot_7.3

7.4. Average EV (drawing weighted)

EV_mean_draw <- mean(EV_time[EV_time$ev_type == "ev_taxes_s",]$ev)
EV_mean_draw
## [1] 0.4321589

7.5. Percentile EV (drawing weighted)

quantile(x = EV_time[EV_time$ev_type == "ev_taxes_s",]$ev,
         probs = c(0, 0.01, 0.05, 0.1, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99, 1))
##        0%        1%        5%       10%       25%       50%       75%       90% 
## 0.2703832 0.2712707 0.2933206 0.2991337 0.3219430 0.3914926 0.5053136 0.6349851 
##       95%       99%      100% 
## 0.6984200 0.8124424 0.9341472

7.6. Average EV (ticket weighted)

sum((EV_time[EV_time$ev_type == "ev_taxes_s",]$ev * EV_time[EV_time$ev_type == "ev_taxes_s",]$sales))/
sum(EV_time[EV_time$ev_type == "ev_taxes_s",]$sales)
## [1] 0.5493451

7.7. Percentile EV (drawing weighted)

This one is a little more annoying. We can’t just use the quantile() function (as far as I understand, without needing to create over 11 billion rows).

The plan here is to:

  • compute cumulative total sales
  • rank all jackpots by expected value
  • calculate cumulative sales, with tickets ranked by expected value
  • convert this cumulative sales, to running percentile of sales
  • use these percentiles to examine values nearest various cut points of interest (e.g. 25%, 50%, etc)
cumulative_sales <- sum(EV_time[EV_time$ev_type == "ev_taxes_s",]$sales)

df_temp <-
  EV_time %>% 
  filter(ev_type == "ev_taxes_s") %>% 
  arrange(ev) %>% 
  mutate(sales_cumult = cumsum(sales),
         sales_percentile = sales_cumult/cumulative_sales)
df_temp
EV_ticket_weighted_percentiles <- data.frame(percentile = c(0, 1, 5, 10, 25, 50, 75, 90, 95, 99, 100),
                                             ev = c(df_temp[which.min(abs(df_temp$sales_percentile - 0.00)), ]$ev,
                                                    df_temp[which.min(abs(df_temp$sales_percentile - 0.01)), ]$ev,
                                                    df_temp[which.min(abs(df_temp$sales_percentile - 0.05)), ]$ev,
                                                    df_temp[which.min(abs(df_temp$sales_percentile - 0.10)), ]$ev,
                                                    df_temp[which.min(abs(df_temp$sales_percentile - 0.25)), ]$ev,
                                                    df_temp[which.min(abs(df_temp$sales_percentile - 0.50)), ]$ev,
                                                    df_temp[which.min(abs(df_temp$sales_percentile - 0.75)), ]$ev,
                                                    df_temp[which.min(abs(df_temp$sales_percentile - 0.90)), ]$ev,
                                                    df_temp[which.min(abs(df_temp$sales_percentile - 0.95)), ]$ev,
                                                    df_temp[which.min(abs(df_temp$sales_percentile - 0.99)), ]$ev,
                                                    df_temp[which.min(abs(df_temp$sales_percentile - 1)), ]$ev))

EV_ticket_weighted_percentiles

7.8. Graph: Single Panel

Note, solution to having boxes appear outside of plot limits can be found here:

plot_ev_basic <-
EV_time %>% 
  filter(ev_type == "ev_taxes_s") %>% 
ggplot(aes(x = jackpot,
           y = ev)) +
  # COLORED BOXES
  annotate("rect",
       xmin = -100e6,
       xmax = 1.8e9,
       ymin = -0.5,
       ymax = 2,
       fill = "red",
       alpha = 0.2) +
  annotate("rect",
       xmin = -100e6,
       xmax = 1.8e9,
       ymin = 2,
       ymax = 3.5,
       fill = "green",
       alpha = 0.2) +
  # DOWN ARROW
  annotate(geom = "text",
           x = 800e6,
           y = 1.92,
           label = "Losing Bet",
           angle = 0,
           size = 4) +
  geom_segment(aes(x = 800e6,
                   y = 1.85,
                   xend = 800e6,
                   yend = 1.6),
                  arrow = arrow(length = unit(0.3, "cm"),
                                type = "closed")) +
  # UP ARROW
  annotate(geom = "text",
           x = 800e6,
           y = 2.08,
           label = "Winning Bet",
           angle = 0,
           size = 4) +
  geom_segment(aes(x = 800e6,
                   y = 2.15,
                   xend = 800e6,
                   yend = 2.4),
                  arrow = arrow(length = unit(0.3, "cm"),
                                type = "closed")) +
  # AVERAGE EV
  geom_segment(aes(x = 50e6,
                   y = EV_mean_draw,
                   xend = -50e6,
                   yend = EV_mean_draw),
                  arrow = arrow(length = unit(0.2, "cm"),
                                type = "open")) +
  annotate(geom = "text",
           x = 0,
           y = EV_mean_draw + 0.3,
           label = paste0("Average \n Expected \n Value \n ($", round(EV_mean_draw, 2), ")"),
           angle = 0,
           size = 4) +
  # DATA
  geom_point(size = 3,
             alpha = 0.25) +
  geom_line(size = 0.85,
            alpha = 0.5) +
  geom_hline(yintercept = 2,
             color = "black",
             size = 1,
             linetype = "dotted") +
  # AXES
  scale_y_continuous(breaks = seq(0, 3, 0.25),
                     labels = scales::label_dollar()) +
  scale_x_continuous(labels = c("$0", paste0("$", seq(1, 9, 1)*100, "m"), paste0("$", seq(1, 1.6, 0.1), "b")),
                     breaks = c(0, 1:16*100e6)) +
  coord_cartesian(xlim = c(0, 1.6e9),
                  ylim = c(0, 3)) +
  # LABELS
  labs(title = paste0("Expected Value v. Jackpot Size",
                      "\n after accounting for ",
                      "(a) taxes, ",
                      "(b) cash value discount, ",
                      "(c) odds of a split pot"),
       y = "Expected Value",
       x = "Advertised Jackpot Size") +
  # STYLES
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 13,
                                  margin = margin(t = 0, b = 10, l = 0, r = 0)),
        axis.title.x = element_text(size = 13,
                                    margin = margin(t = 10, b = 0, r = 0, l = 0)),
        axis.title.y = element_text(size = 13,
                                    margin = margin(t = 0, b = 0, r = 10, l = 0)))
plot_ev_basic

7.8. Save

if (plot_save){
ggsave(plot = plot_ev_basic,
       filename = "Expected_Value_Basic.pdf",
       width = 12,
       height = 8)
}

7.9. Graph: Advertised, Cash, Taxes

plot_ev_comp3 <-
EV_time %>% 
  filter(ev_type == "ev_advert_s" | ev_type == "ev_cash_s" | ev_type == "ev_taxes_s") %>% 
  mutate(ev_type = case_when(ev_type == "ev_advert_s" ~ "As Advertised (No Cash Discount, No Taxes)",
                              ev_type == "ev_cash_s" ~ "With Cash Discount Only",
                              ev_type == "ev_taxes_s" ~ "With Taxes & Cash Discount"),
         ev_type = factor(ev_type,
                          levels = c("As Advertised (No Cash Discount, No Taxes)",
                                     "With Cash Discount Only (No Taxes)",
                                     "With Taxes & Cash Discount"))) %>% 
ggplot(aes(x = jackpot,
           y = ev,
           color = ev_type)) +
  # COLORED BOXES
  annotate("rect",
       xmin = -100e6,
       xmax = 1.8e9,
       ymin = -0.5,
       ymax = 2,
       fill = "red",
       alpha = 0.2) +
  annotate("rect",
       xmin = -100e6,
       xmax = 1.8e9,
       ymin = 2,
       ymax = ceiling(max(EV_time$ev)),
       fill = "green",
       alpha = 0.2) +
  # DOWN ARROW
  annotate(geom = "text",
           x = 150e6,
           y = 1.92,
           label = "Losing Bet",
           angle = 0,
           size = 4) +
  geom_segment(aes(x = 150e6,
                   y = 1.85,
                   xend = 150e6,
                   yend = 1.6),
                  arrow = arrow(length = unit(0.3, "cm"),
                                type = "closed"),
               color = "black") +
  # UP ARROW
  annotate(geom = "text",
           x = 150e6,
           y = 2.1,
           label = "Winning Bet",
           angle = 0,
           size = 4) +
  geom_segment(aes(x = 150e6,
                   y = 2.15,
                   xend = 150e6,
                   yend = 2.4),
                  arrow = arrow(length = unit(0.3, "cm"),
                                type = "closed"),
               color = "black") +
  # DATA
  geom_point(size = 3,
             alpha = 0.25) +
  geom_line(size = 0.85,
            alpha = 0.5) +
  geom_hline(yintercept = 2,
             color = "black",
             size = 1,
             linetype = "dotted") +
  # AXES
  scale_y_continuous(breaks = seq(0, ceiling(max(EV_time$ev)), 0.25),
                     labels = scales::label_dollar()) +
  scale_x_continuous(labels = c("$0", paste0("$", seq(1, 9, 1)*100, "m"), paste0("$", seq(1, 1.6, 0.1), "b")),
                     breaks = c(0, 1:16*100e6)) +
  scale_color_manual(values = c("blue4", "green4", "black")) +
  coord_cartesian(xlim = c(0, 1.6e9),
                  ylim = c(0, 3)) +
  # LABELS
  labs(title = paste0("Expected Value v. Jackpot Size"),
       y = "Expected Value",
       x = "Advertised Jackpot Size",
       color = "Expected Value Calculated:") +
  # STYLES
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 13,
                                  margin = margin(t = 0, b = 0, l = 0, r = 0)),
        axis.title.x = element_text(size = 13,
                                    margin = margin(t = 10, b = 0, r = 0, l = 0)),
        axis.title.y = element_text(size = 13,
                                    margin = margin(t = 0, b = 0, r = 10, l = 0)),
        axis.text.x = element_text(angle = 0),
        legend.position = "top")

plot_ev_comp3
## Warning: Removed 336 rows containing missing values (geom_point).
## Warning: Removed 336 row(s) containing missing values (geom_path).

7.9. Save

if (plot_save){
ggsave(plot = plot_ev_comp3,
       filename = "Expected_Value_Comp3.pdf",
       width = 12,
       height = 8)
}

7.10. Graph: Advertised, Cash, Taxes

plot_ev_panel <-
EV_time %>% 
  mutate(ev_calc = case_when(ev_calc == "advertised" ~ "As Advertised (No Cash Discount, No Taxes)",
                             ev_calc == "cash" ~ "With Cash Discount Only",
                             ev_calc == "taxes" ~ "With Taxes & Cash Discount"),
         ev_calc = factor(ev_calc,
                          levels = c("As Advertised (No Cash Discount, No Taxes)",
                                     "With Cash Discount Only (No Taxes)",
                                     "With Taxes & Cash Discount")),
         split = case_when(split == TRUE ~ "If Multiple Winners: Pot Gets Split",
                           split == FALSE ~ "If Multiple Winners: Each Person Gets Full Pot Value")) %>% 
ggplot(aes(x = jackpot,
           y = ev,
           color = ev_calc)) +
  # COLORED BOXES
  annotate("rect",
       xmin = -100e6,
       xmax = 1.8e9,
       ymin = -0.5,
       ymax = 2,
       fill = "red",
       alpha = 0.2) +
  annotate("rect",
       xmin = -100e6,
       xmax = 1.8e9,
       ymin = 2,
       ymax = ceiling(max(EV_time$ev)),
       fill = "green",
       alpha = 0.2) +
  # DOWN ARROW
  annotate(geom = "text",
           x = 150e6,
           y = 1.92,
           label = "Losing Bet",
           angle = 0,
           size = 4) +
  geom_segment(aes(x = 150e6,
                   y = 1.85,
                   xend = 150e6,
                   yend = 1.6),
                  arrow = arrow(length = unit(0.3, "cm"),
                                type = "closed"),
               color = "black") +
  # UP ARROW
  annotate(geom = "text",
           x = 150e6,
           y = 2.10,
           label = "Winning Bet",
           angle = 0,
           size = 4) +
  geom_segment(aes(x = 150e6,
                   y = 2.15,
                   xend = 150e6,
                   yend = 2.4),
                  arrow = arrow(length = unit(0.3, "cm"),
                                type = "closed"),
               color = "black") +
  # DATA
  geom_point(size = 3,
             alpha = 0.25) +
  geom_line(size = 0.85,
            alpha = 0.5) +
  geom_hline(yintercept = 2,
             color = "black",
             size = 1,
             linetype = "dotted") +
  # AXES
  scale_y_continuous(breaks = seq(0, ceiling(max(EV_time$ev)), 0.25),
                     labels = scales::label_dollar()) +
  scale_x_continuous(#labels = c("$0", paste0("$", seq(1, 9, 1)*100, "m"), paste0("$", seq(1, 1.6, 0.1), "b")),
                     labels = c(paste0("$", c(0, 2.5, 5, 7.5)*100, "m"), paste0("$", c(1, 1.25, 1.5), "b")),
                     #breaks = c(0, 1:16*100e6)) +
                     breaks = c(0, 2.5, 5, 7.5, 10, 12.5, 15)*100e6) +
  scale_color_manual(values = rep(c("blue4", "green4", "black"), 2)) +
  coord_cartesian(xlim = c(0, 1.6e9),
                  ylim = c(0, max(EV_time$ev))) +
  # PANELS
  facet_grid(.~split) +
  # LABELS
  labs(title = paste0("Expected Value v. Jackpot Size"),
       y = "Expected Value",
       x = "Advertised Jackpot Size",
       color = "Expected Value Calculated:") +
  # STYLES
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 13,
                                  margin = margin(t = 0, b = 0, l = 0, r = 0)),
        axis.title.x = element_text(size = 13,
                                    margin = margin(t = 10, b = 0, r = 0, l = 0)),
        axis.title.y = element_text(size = 13,
                                    margin = margin(t = 0, b = 0, r = 10, l = 0)),
        axis.text.x = element_text(angle = 0),
        legend.position = "top")

plot_ev_panel
## Warning: Removed 672 rows containing missing values (geom_point).
## Warning: Removed 672 row(s) containing missing values (geom_path).

## 7.10. Save

if (plot_save){
ggsave(plot = plot_ev_panel,
       filename = "Expected_Value_Panel.pdf",
       width = 12,
       height = 8)
}

7.11. All Positive EV Cases

[note: need to amend this, given updated calculations]

Note there have only ever been 7 cases where a ticket has had an expected value higher than the cost of a ticket. And this is only in fantasy cases, where we change or relax the actual lottery rules and payout structures. This is 7 out of 2016 hypothetical drawings considered (336 drawins X 6 possible scenarios for each).

EV_time %>% 
  filter(ev > 2)
EV_time

END/MISC

EV_time %>% 
  filter(ev_type == "ev_taxes_s" & jackpot > 900e6)