## ------------------------------------------------------------------------
6e8 * (1-0.5)^(-1/(2.33-1))

## ------------------------------------------------------------------------
6e8 * (1-0.4)^(-1/(2.33-1))

## ------------------------------------------------------------------------
1e6 * (1-0.92)^(-1/(2.5-1))

## ------------------------------------------------------------------------
# Calculate quantiles of the Pareto distribution
# Inputs: desired quantile (p)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
# Outputs: the pth quantile
qpareto.1 <- function(p, exponent, threshold) {
  q <- threshold*((1-p)^(-1/(exponent-1)))
  return(q)
}

## ------------------------------------------------------------------------
qpareto.1(p=0.5,exponent=2.33,threshold=6e8)
qpareto.1(p=0.4,exponent=2.33,threshold=6e8)
qpareto.1(p=0.92,exponent=2.5,threshold=1e6)

## ------------------------------------------------------------------------
# Calculate quantiles of the Pareto distribution
# Inputs: desired quantile (p)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
  # flag for whether to give lower or upper quantiles (lower.tail)
# Outputs: the pth quantile
qpareto.2 <- function(p, exponent, threshold, lower.tail=TRUE) {
  if(lower.tail==FALSE) {
    p <- 1-p
  }
  q <- threshold*((1-p)^(-1/(exponent-1)))
  return(q)
}

## ------------------------------------------------------------------------
qpareto.2(p=0.5,exponent=2.33,threshold=6e8,lower.tail=TRUE)
qpareto.2(p=0.5,exponent=2.33,threshold=6e8)
qpareto.2(p=0.92,exponent=2.5,threshold=1e6)
qpareto.2(p=0.5,exponent=2.33,threshold=6e8,lower.tail=FALSE)
qpareto.2(p=0.92,exponent=2.5,threshold=1e6,lower.tail=FALSE)

## ------------------------------------------------------------------------
# Calculate quantiles of the Pareto distribution
# Inputs: desired quantile (p)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
  # flag for whether to give lower or upper quantiles (lower.tail)
# Outputs: the pth quantile
qpareto.3 <- function(p, exponent, threshold, lower.tail=TRUE) {
  if(lower.tail==FALSE) {
    p <- 1-p
  }
  q <- qpareto.1(p, exponent, threshold)
  return(q)
}

## ------------------------------------------------------------------------
# Calculate quantiles of the Pareto distribution
# Inputs: desired quantile (p)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
  # flag for whether to give lower or upper quantiles (lower.tail)
# Outputs: the pth quantile
qpareto.4 <- function(p, exponent, threshold, lower.tail=TRUE) {
  stopifnot(p >= 0, p <= 1, exponent > 1, threshold > 0)
  q <- qpareto.3(p,exponent,threshold,lower.tail)
  return(q)
}

## ------------------------------------------------------------------------
qpareto.4(p=0.5,exponent=2.33,threshold=6e8,lower.tail=TRUE)
qpareto.4(p=0.92,exponent=2.5,threshold=1e6,lower.tail=FALSE)
qpareto.4(p=1.92,exponent=2.5,threshold=1e6,lower.tail=FALSE)
qpareto.4(p=-0.02,exponent=2.5,threshold=1e6,lower.tail=FALSE)
qpareto.4(p=0.92,exponent=0.5,threshold=1e6,lower.tail=FALSE)
qpareto.4(p=0.92,exponent=2.5,threshold=-1,lower.tail=FALSE)
qpareto.4(p=-0.92,exponent=2.5,threshold=-1,lower.tail=FALSE)

## ----rpareto-with-deliberate-bug-----------------------------------------
# Generate random numbers from the Pareto distribution
# Inputs: number of random draws (n)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
# Outputs: vector of random numbers
rpareto <- function(n,exponent,threshold) {
  x <- vector(length=n)
  for (i in 1:n) {
    x[i] <- qpareto.4(p=rnorm(1),exponent=exponent,threshold=threshold)
  }
  return(x)
}

## ------------------------------------------------------------------------
rpareto(10)

## ----eval=FALSE----------------------------------------------------------
## traceback()
## ## 3: stopifnot(p >= 0, p <= 1, exponent > 1, threshold > 0) at #2
## ## 2: qpareto.4(p = rnorm(1), exponent = exponent, threshold = threshold) at #4
## ## 1: rpareto(10)

## ------------------------------------------------------------------------
rpareto(n=10,exponent=2.5,threshold=1)

## ------------------------------------------------------------------------
p = rnorm(1)

## ----rpareto-without-deliberate-bug--------------------------------------
# Generate random numbers from the Pareto distribution
# Inputs: number of random draws (n)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
# Outputs: vector of random numbers
rpareto <- function(n,exponent,threshold) {
  x <- vector(length=n)
  for (i in 1:n) {
    x[i] <- qpareto.4(p=runif(1),exponent=exponent,threshold=threshold)
  }
  return(x)
}

## ------------------------------------------------------------------------
rpareto(n=10,exponent=2.5,threshold=1)

## ------------------------------------------------------------------------
r <- rpareto(n=1e4,exponent=2.5,threshold=1)
qpareto.4(p=0.5,exponent=2.5,threshold=1)
quantile(r,0.5)
qpareto.4(p=0.1,exponent=2.5,threshold=1)
quantile(r,0.1)
qpareto.4(p=0.9,exponent=2.5,threshold=1)
quantile(r,0.9)

## ----simulation-vs-theory-quantiles, echo=FALSE--------------------------
simulated.percentiles <- quantile(r,(0:99)/100)
theoretical.percentiles <- qpareto.4((0:99)/100,exponent=2.5,threshold=1)
plot(theoretical.percentiles,simulated.percentiles)
abline(0,1)

## ------------------------------------------------------------------------
# Compare random draws from Pareto distribution to theoretical quantiles
# Inputs: None
# Outputs: None
# Side-effects: Adds points showing random draws vs. theoretical quantiles
  # to current plot
pareto.sim.vs.theory <- function() {
  r <- rpareto(n=1e4,exponent=2.5,threshold=1)
  simulated.percentiles <- quantile(r,(0:99)/100)
  points(theoretical.percentiles,simulated.percentiles)
}

## ----simulation-vs-theory-quantiles-many, echo=FALSE---------------------
simulated.percentiles <- quantile(r,(0:99)/100)
theoretical.percentiles <- qpareto.4((0:99)/100,exponent=2.5,threshold=1)
plot(theoretical.percentiles,simulated.percentiles)
abline(0,1)
for (i in 1:10) { pareto.sim.vs.theory() }

## ------------------------------------------------------------------------
# Compare random draws from Pareto distribution to theoretical quantiles
# Inputs: Graphical arguments, passed to points()
# Outputs: None
# Side-effects: Adds points showing random draws vs. theoretical quantiles
  # to current plot
pareto.sim.vs.theory <- function(...) {
  r <- rpareto(n=1e4,exponent=2.5,threshold=1)
  simulated.percentiles <- quantile(r,(0:99)/100)
  points(theoretical.percentiles,simulated.percentiles,...)
}

## ----simulation-vs-theory-quantiles-many-2, echo=FALSE-------------------
simulated.percentiles <- quantile(r,(0:99)/100)
theoretical.percentiles <- qpareto.4((0:99)/100,exponent=2.5,threshold=1)
plot(theoretical.percentiles,simulated.percentiles)
abline(0,1)
for (i in 1:10) {
  pareto.sim.vs.theory(pch=i,type="b",lty=i)
}

## ------------------------------------------------------------------------
# Check Pareto random number generator, by repeatedly generating random draws
  # and comparing them to theoretical quantiles
# Inputs: Number of random points to generate per replicate (n)
  # exponent of distribution (exponent)
  # lower threshold of distribution (threshold)
  # number of replicates to run (B)
# Outputs: None
# Side-effects: Creates new plot, plots simulated points vs. theory
check.rpareto <- function(n=1e4,exponent=2.5,threshold=1,B=10) {
  # One set of percentiles for everything
  theoretical.percentiles <- qpareto.4((0:99)/100,exponent=exponent,
                                       threshold=threshold)
  # Set up plotting window, but don't put anything in it:
  plot(0,type="n", xlim=c(0,max(theoretical.percentiles)),
       # No more horizontal room than we need
       ylim=c(0,1.1*max(theoretical.percentiles)),
       # Allow some extra vertical room for noise
       xlab="theoretical percentiles", ylab="simulated percentiles",
       main = paste("exponent = ", exponent, ", threshold = ", threshold))
  # Diagonal, for visual reference
  abline(0,1)
  for (i in 1:B) {
    pareto.sim.vs.theory(n=n,exponent=exponent,threshold=threshold,
                            pch=i,type="b",lty=i)
  }
}

## ------------------------------------------------------------------------
# Compare random draws from Pareto distribution to theoretical quantiles
# Inputs: Graphical arguments, passed to points()
# Outputs: None
# Side-effects: Adds points showing random draws vs. theoretical quantiles
  # to current plot
pareto.sim.vs.theory <- function(n,exponent,threshold,...) {
  r <- rpareto(n=n,exponent=exponent,threshold=threshold)
  simulated.percentiles <- quantile(r,(0:99)/100)
  points(theoretical.percentiles,simulated.percentiles,...)
}

## ----theory-vs-simulation-adequate, echo=FALSE---------------------------
check.rpareto()

## ----simulation-vs-theory-bad, echo=FALSE--------------------------------
check.rpareto(n=1e4,exponent=2.33,threshold=9e8)

## ------------------------------------------------------------------------
x <- 7
x
square <- function(y) { x <- y^2; return(x) }
square(7)
x

## ------------------------------------------------------------------------
# Compare random draws from Pareto distribution to theoretical quantiles
# Inputs: Number of random points to generate (n)
  # exponent of distribution (exponent)
  # lower threshold of distribution (threshold)
  # graphical arguments, passed to points() (...)
# Outputs: None
# Side-effects: Adds points showing random draws vs. theoretical quantiles
  # to current plot
pareto.sim.vs.theory <- function(n,exponent,threshold,...) {
  r <- rpareto(n=n,exponent=exponent,threshold=threshold)
  theoretical.percentiles <- qpareto.4((0:99)/100,exponent=exponent,
                                       threshold=threshold)
  simulated.percentiles <- quantile(r,(0:99)/100)
  points(theoretical.percentiles,simulated.percentiles,...)
}

## ------------------------------------------------------------------------
# Compare random draws from Pareto distribution to theoretical quantiles
# Inputs: Graphical arguments, passed to points()
# Outputs: None
# Side-effects: Adds points showing random draws vs. theoretical quantiles
  # to current plot
check.rpareto <- function(n=1e4,exponent=2.5,threshold=1,B=10) {
  # One set of percentiles for everything
  theoretical.percentiles <- qpareto.4((0:99)/100,exponent=exponent,
    threshold=threshold)
  # Set up plotting window, but don't put anything in it:
  plot(0,type="n", xlim=c(0,max(theoretical.percentiles)),
    # No more horizontal room than we need
    ylim=c(0,1.1*max(theoretical.percentiles)),
    # Allow some extra vertical room for noise
    xlab="theoretical percentiles", ylab="simulated percentiles",
    main = paste("exponent = ", exponent, ", threshold = ", threshold))
  # Diagonal, for visual reference
  abline(0,1)
  for (i in 1:B) {
    pareto.sim.vs.theory(n=n,exponent=exponent,threshold=threshold,
      theoretical.percentiles=theoretical.percentiles,
      pch=i,type="b",lty=i)
  }
}

# Compare random draws from Pareto distribution to theoretical quantiles
# Inputs: Number of random points to generate (n)
  # exponent of distribution (exponent)
  # lower threshold of distribution (threshold)
  # vector of theoretical percentiles (theoretical.percentiles)
  # graphical arguments, passed to points()
# Outputs: None
# Side-effects: Adds points showing random draws vs. theoretical quantiles
  # to current plot
pareto.sim.vs.theory <- function(n,exponent,threshold,
  theoretical.percentiles,...) {
  r <- rpareto(n=n,exponent=exponent,threshold=threshold)
  simulated.percentiles <- quantile(r,(0:99)/100)
  points(theoretical.percentiles,simulated.percentiles,...)
}

## ----simulation-vs-theory-good, echo=FALSE-------------------------------
check.rpareto(1e4,2.33,9e8)

## ----rpareto-with-replicate----------------------------------------------
# Generate random numbers from the Pareto distribution
# Inputs: number of random draws (n)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
# Outputs: vector of random numbers
rpareto <- function(n,exponent,threshold) {
  x <- replicate(n,qpareto.4(p=runif(1),exponent=exponent,threshold=threshold))
  return(x)
}

## ----rpareto-vectorized--------------------------------------------------
# Generate random numbers from the Pareto distribution
# Inputs: number of random draws (n)
  # exponent of the distribution (exponent)
  # lower threshold of the distribution (threshold)
# Outputs: vector of random numbers
rpareto <- function(n,exponent,threshold) {
  x <- qpareto.4(p=runif(n),exponent=exponent,threshold=threshold)
  return(x)
}

## ------------------------------------------------------------------------
# Calculate Huber's loss function
# Input: vector of numbers x
# Return: x^2 for |x|<1, 2|x|-1 otherwise
huber <- function(x) {
  n <- length(x)
  y <- vector(n)
  for (i in 1:n) {
    if (abs(x) <= 1) {
      y[i] <- x[i]^2
    } else {
      y[i] <- 2*abs(x[i])-1
    }
  }
  return(y)
}

## ------------------------------------------------------------------------
# Calculate Huber's loss function
# Input: vector of numbers x
# Return: x^2 for |x|<1, 2|x|-1 otherwise
huber <- function(x) {
  return(ifelse(abs(x) <= 1, x^2, 2*abs(x)-1))
}

## ----huber, echo=FALSE, out.width="0.5\\textwidth"-----------------------
curve(x^2,col="grey",from=-5,to=5,ylab="")
curve(huber,add=TRUE)

## ------------------------------------------------------------------------
y <- c(0.9,0.99,0.999,0.99999)
lapply(y,qpareto.4,exponent=2.5,threshold=1)

## ------------------------------------------------------------------------
sapply(y,qpareto.4,exponent=2.5,threshold=1)

## ------------------------------------------------------------------------
sapply(fiveyear.models.2, coefficients)

## ------------------------------------------------------------------------
gaussian.mle <- function(x) {
  n <- length(x)
  mean.est <- mean(x)
  var.est <- var(x)*(n-1)/n
  est <- list(mean=mean.est, sd=sqrt(var.est))
  return(est)
}

## ------------------------------------------------------------------------
x <- 1:10
mean(x)
var(x) * (length(x)-1)/length(x)
sqrt(var(x) * (length(x)-1)/length(x))
gaussian.mle(x)

## ------------------------------------------------------------------------
x <- rnorm(n=100,mean=3,sd=2)

## ------------------------------------------------------------------------
x <- rnorm(n=100,mean=3,sd=2)
median(x)

## ------------------------------------------------------------------------
medians <- vector(length=100)
for (i in 1:100) {
  x <- rnorm(n=100,mean=3,sd=2)
  medians[i] <- median(x)
}
se.in.median <- sd(medians)

## ------------------------------------------------------------------------
# Inputs: None (everything is hard-coded)
# Output: the standard error in the median
find.se.in.median <- function() {
  # Set up a vector to store the simulated medians
  medians <- vector(length=100)
  # Do the simulation 100 times
  for (i in 1:100) {
    x <- rnorm(n=100,mean=3,sd=2) # Simulate
    medians[i] <- median(x) # Calculate the median of the simulation
  }
  se.in.median <- sd(medians) # Take standard deviation
  return(se.in.median)
}

## ------------------------------------------------------------------------
# Inputs: Number of replicates (B)
# Output: the standard error in the median
find.se.in.median <- function(B) {
  # Set up a vector to store the simulated medians
  medians <- vector(length=B)
  # Do the simulation B times
  for (i in 1:B) {
    x <- rnorm(n=100,mean=3,sd=2) # Simulate
    medians[i] <- median(x) # Calculate median of the simulation
  }
  se.in.median <- sd(medians) # Take standard deviation
  return(se.in.median)
}

## ------------------------------------------------------------------------
find.se.in.median.exp <- function(B) {
  # Set up a vector to store the simulated medians
  medians <- vector(length=B)
  # Do the simulation B times
  for (i in 1:B) {
    x <- rexp(n=37,rate=2) # Simulate
    medians[i] <- median(x) # Calculate median of the simulation
  }
  se.in.median <- sd(medians) # Take standard deviation
  return(se.in.median)
}

## ------------------------------------------------------------------------
# Inputs: number of replicates (B)
  # flag for whether to use a normal or an exponential (use.norm)
# Output: The standard error in the median
find.se.in.median <- function(B,use.norm=TRUE) {
  medians <- vector(length=B)
  for (i in 1:B) {
    if (use.norm) {
      x <- rnorm(100,3,2)
    } else {
      x <- rexp(37,2)
    }
    medians[i] <- median(x)
  }
  se.in.median <- sd(medians)
  return(se.in.median)
}

## ------------------------------------------------------------------------
# Inputs: Number of replicates (B)
  # Simulator function (simulator)
# Presumes: simulator is a no-argument function which produce a vector of
  # numbers
# Output: The standard error in the media
find.se.in.median <- function(B,simulator) {
  median <- vector(length=B)
  for (i in 1:B) {
    x <- simulator()
    medians[i] <- median(x)
  }
  se.in.medians <- sd(medians)
  return(se.in.medians)
}

## ------------------------------------------------------------------------
# Inputs: None
# Output: ten draws from the mean 3, s.d. 2 Gaussian
simulator.1 <- function() {
  return(rnorm(10,3,2))
}

## ------------------------------------------------------------------------
# Inputs: None
# Output: 37 draws from the rate 2 exponential
simulator.2 <- function() {
  return(rexp(37,2))
}

## ------------------------------------------------------------------------
find.se.in.median(B=100, simulator=simulator.2)

## ------------------------------------------------------------------------
# Perturb the currency-undervaluation data by re-sampling and fit a kernel
  # regression for growht on initial GDP and undervaluation
# Inputs: None
# Output: The fitted growth rates from a new kernel regression
simulator.3 <- function() {
  # Make sure the np library is loaded
  require(np)
  # If we haven't already loaded the data, load it
  if (!exists("uv")) {
    uv <- read.csv("http://www.stat.cmu.edu/~cshalizi/uADA/16/hw/02/uv.csv")
  }
  # How big is the data set?
  n <- nrow(uv)
  # Treat the data set like a population and draw a sample
  resampled.rows <- sample(1:n,size=n,replace=TRUE)
  uv.r <- uv[resampled.rows,]
  # See the chapter on smoothing for the following incantation
  fit <- npreg(growth~log(gdp)+underval, data=uv.r, tol=1e-2, ftol=1e-2)
  growth.rates <- fitted(fit)
  return(growth.rates)
}

## ------------------------------------------------------------------------
find.se.in.median(B=10, simulator=simulator.3)

## ------------------------------------------------------------------------
# Inputs: number of replicates (B)
  # Simulator function (simulator)
# Presumes: simulator is a no-argument function which produces a vector of
  # numbers
# Outputs: Standard error in the median of the output of simulator
find.se.in.median <- function(B,simulator) {
  medians <- replicate(B, median(simulator()))
  se.in.median <- sd(medians)
  return(se.in.median)
}

## ------------------------------------------------------------------------
# Inputs: number of replicates (B)
  # Simulator function (simulator)
# Presumes: simulator is a no-argument function which produces a vector of
  # numbers
# Outputs: Interquartile range of the median of the output of simulator
find.iqr.of.median <- function(B,simulator) {
  medians <- replicate(B,median(simulator()))
  iqr.of.median <- IQR(medians)
  return(iqr.of.median)
}

## ------------------------------------------------------------------------
# Inputs: number of replicates (B)
  # Simulator function (simulator)
# Presumes: simulator is a no-argument function which produces a vector of
  # numbers
# Outputs: Standard error of the mean of the output of simulator
find.se.of.mean <- function(B,simulator) {
  means <- replicate(B,mean(simulator()))
  se.of.mean <- sd(means)
  return(se.of.mean)
}

## ------------------------------------------------------------------------
# Inputs: number of replicates (B)
  # Simulator function (simulator)
  # Estimator function (estimator)
  # Sample summarizer function (summarizer)
# Presumes: simulator is a no-argument function which produces a vector of
  # numbers
  # estimator is a function that takes a vector of numbers and produces one
  # output
  # summarizer takes a vector of outputs from estimator
# Outputs: Summary of the simulated distribution of estimates
summarize.sampling.dist.of.estimates <- function(B,simulator,estimator,
                                                 summarizer) {
  estimates <- replicate(B,estimator(simulator()))
  return(summarizer(estimates))
}

## ------------------------------------------------------------------------
bootstrap <- function(B,simulator,estimator,summarizer) {
  estimates <- replicate(B,estimator(simulator()))
  return(summarizer(estimates))
}

## ------------------------------------------------------------------------
bootstrap(B=100,simulator=simulator.1, estimator=median, summarizer=sd)