# In-class demos for simulation, 7 February 2016

# Load this week's homework data
stocks <- read.csv("http://www.stat.cmu.edu/~cshalizi/uADA/17/hw/03/stock_history.csv")

# Look at the cumulative returns
plot(Return_cumul ~ Date, data=stocks, xlab="date",
  ylab="Cumulative value of initial $1", type="l")
# A log scale for the vertical axis is suggestive
plot(Return_cumul ~ Date, data=stocks, xlab="date",
  ylab="Cumulative value of initial $1", type="l", log="y")

# Create a time series of logarithmic returns
  # logarithmic returns follows the visual suggestion of the last plot
stocks$logreturns <- c(diff(log(stocks$Return_cumul)), NA)  # Why the NA?
plot(logreturns ~ Date, data=stocks, xlab="date",
  ylab="Monthly logarithmic returns",type="l")

# Look at the histogram
hist(stocks$logreturns, freq=FALSE, xlab="log returns", n=51,
  main="Distribution of log returns")

# The conventional model: log returns are IID Gaussian
  # Maximum-likelihood estimates of mean and variance are the sample values
    # Need na.rm here because of the final NA in the vector of returns
(mean.logreturns <- mean(stocks$logreturns,na.rm=TRUE))
(sd.logreturns <- sd(stocks$logreturns,na.rm=TRUE))
curve(dnorm(x, mean=mean.logreturns, sd=sd.logreturns), add=TRUE, lwd=3)

# How bad are the worst 1% of months, according to the model?
  # "99% value at risk" is (roughly) this fractional loss * size of portfolio
# Analytical answer:
qnorm(0.01,mean.logreturns,sd.logreturns)
  # Presumes we can work out the quantile function exactly
# Simulation answer:
  # - Create a very long simulation run (a million points)
  # - Look at the sample quantile in that long run?
quantile(rnorm(1e6, mean.logreturns, sd.logreturns), 0.01)
# How bad are the worst 1% of two months in a row?
### Exercise: find the analytical answer in the Gaussian model
# Simulation answer:
  # - Make another (or the same!) long run
  # - Take sum of consecutive pairs
  # - Take quantile of the sums
norm.sim.run <- rnorm(1e6, mean.logreturns, sd.logreturns)
quantile(head(norm.sim.run,-1) + tail(norm.sim.run,-1), 0.01)

# Does the model look like the real data?
# Start by plotting the real data again
plot(logreturns ~ Date, data=stocks, xlab="date",
  ylab="Monthly logarithmic returns",type="l",lwd=2)
# Now do a simulation run, matched in length to the data, and add it to the
# plot, but fainter
lines(stocks$Date, rnorm(n=nrow(stocks), mean=mean.logreturns, sd=sd.logreturns),
  col="darkgrey", lwd=1)
# This shouldn't line up perfectly (why not?), but:
  # - data has more comparatively large movements than the model
  # - spikes seem to cluster more in the data than in the model

# How bad were the worst 1% of months?
(q0.01 <- quantile(stocks$logreturns, 0.01, na.rm=TRUE))
# What's the probability that the sample 1%, over this length of time, would
# be at least that low?
  # Analytical answer: just try to find it!
  # Simulation answer: just try it out:
sim.history.std <- function() {
    rnorm(n=nrow(stocks), mean=mean.logreturns, sd=sd.logreturns)
}
simd.1stpercent.1k <- replicate(1000, quantile(sim.history.std(), 0.01))
mean(simd.1stpercent.1k <= q0.01 )


# Let's try something with heavier tails: the t distribution
library(MASS)
  # The fitdistr() function fits parametric distributions by maximum
  # likelihood; it knows about the t distribution
  # R trick: wrapping an assignment in parentheses, (a<-b), does the
  # assignment and prints out the value assigned
(t.est <- fitdistr(na.omit(stocks$logreturns),"t"))
  # The standard t distribution is centered around 0 and has scale 1
  # fitdistr shifts the center by m and changes the scale by s
  # so (X-m)/s has a standard t distribution
# restore the plot with the histogram
hist(stocks$logreturns, freq=FALSE, xlab="log returns", n=51,
  main="Distribution of log returns", ylim=c(0,15))
# and the Gaussian density curve
curve(dnorm(x, mean=mean.logreturns, sd=sd.logreturns), add=TRUE,
  lwd=3)
# Add the t-distribution density curve
  # R has a built-in density for the t distribution, dt(), but we need
  # to deal with the shift and the scale
dt.fitted <- function(x,fitted.t=t.est) {
  m <- fitted.t$estimate["m"]
  s <- fitted.t$estimate["s"]
  df <- fitted.t$estimate["df"]
  return((1/s)*dt((x-m)/s,df=df)) # why the (1/s) factor out front?
}
# Finally, plot the density of the fitted t distribution
curve(dt.fitted(x),add=TRUE,col="blue",lwd=3)
  # Looks definitely more promising, if not necessarily right

# Write something to simulate IID draws from the fitted t distributed
  # Uses the built-in rt(), but handles the shift and the scale
rt.fitted <- function(n,fitted.t=t.est) {
  m <- fitted.t$estimate["m"]
  s <- fitted.t$estimate["s"]
  df <- fitted.t$estimate["df"]
  t <- rt(n,df=df)
  x <- s*t + m
  return(x)
}

# How does a simulated path look?
plot(logreturns ~ Date, data=stocks, xlab="date",
  ylab="Monthly logarithmic returns", type="l", lwd=2)
lines(stocks$Date, rt.fitted(n=nrow(stocks)), col="darkgrey", lwd=1)
# Much better spikiness, but still not much clustering

# How bad does this say the worst 1% of months are?
quantile(rt.fitted(1e6),0.01)

#### Maybe the IID part is wrong?  ####

# Tack 1: trends over time ("fixed effects", as in HW 2)
library(np)
# Fit a regression of returns on the date (a numerical variable)
time.trend <- npreg(logreturns ~ Date, data=stocks, tol=1e-3, ftol=1e-3)
# Restore the plot of the time series
plot(logreturns ~ Date, data=stocks, xlab="date",
  ylab="Monthly logarithmic returns", type="l",lwd=2)
# Plot the time trend
  # Not quite flat, if you zoom in
lines(head(stocks$Date,-1), fitted(time.trend), col="darkgrey",lwd=4)
# Simulate some scatter around the time trend
  # Note: rnorm now returns a vector!
lines(head(stocks$Date,-1), rnorm(n=length(fitted(time.trend)),
                                  mean=fitted(time.trend),
                                  sd=sqrt(time.trend$MSE)),
  col="blue", lwd=1)
# Looks a bit better but still isn't clustered enough
  # Exercise: could you replace rnorm() above with a t distribution?

# OK, what about dependence from one month to the next, rather than a fixed
# effect of time?
# Plot this month's returns against next
plot(head(stocks$logreturns,-1), tail(stocks$logreturns,-1),
  xlab="This month's returns", ylab="Next month's returns")
# The heads and tails are getting old; make a data frame and be done
returns.df <- data.frame(Date=head(stocks$Date,-1),
                         r0=head(stocks$logreturns,-1),
                         r1=tail(stocks$logreturns,-1))
returns.df <- head(returns.df,-1) # Remove an NA row
# Regress next month's returns on this month's
time.regression <- npreg(r1~r0, data=returns.df, tol=1e-3, ftol=1e-3)
# Create a sequence of evenly spaced values on the horizontal axis for plotting
plotting.r0 <- seq(from=-0.3, to=0.3, length.out=100)
# Add the fitted values to the plot
lines(plotting.r0, predict(time.regression, newdata=data.frame(r0=plotting.r0)),
  col="red",lwd=2)
  # Weirdness out where there are just a few data points, but a pretty
  # nearly linear trend through the bulk of the data
abline(a=0, b=1, col="green", lwd=2) # For comparison of slope
# Now, how do we use this to simulate step by step?

# Start with an empty vector to store results
  # We could keep extending a vector, but that's very slow
fake.returns <- vector(length=nrow(stocks))
# Give it an initial value for the first month
fake.returns[1] <- stocks$logreturns[1]
# Now for every new month
for (t in 2:nrow(stocks)) {
  # Work out the expected value from the previous month's simulated returns
  expected <- predict(time.regression,
                      newdata=data.frame(r0=fake.returns[t-1]))
  # Add some noise and stick it in the vector
  fake.returns[t] <- expected+
    rnorm(n=1,mean=0,sd=sqrt(time.regression$MSE))
}

plot(logreturns ~ Date, data=stocks, xlab="date",
     ylab="Monthly logarithmic returns", type="l", lwd=2)
lines(stocks$Date,fake.returns, col="purple", lwd=2)
quantile(fake.returns, 0.01)
  # Looks rather more convincingly clustered, but not spiky enough
  # Exercise: turn the simulation into a function, to make it eaiser to re-run
  # Exercise: Replace rnorm() in the code with drawing from a t distribution