# In-class demos for simulation, 7 February 2016

# Load this week's homework data
stocks <- read.csv("http://www.stat.cmu.edu/~cshalizi/uADA/17/hw/03/stock_history.csv")

# Look at the cumulative returns
plot(Return_cumul ~ Date, data = stocks, xlab = "date",
     ylab = "Cumulative value of initial $1", type = "l")
# A log scale for the vertical axis is suggestive
plot(Return_cumul ~ Date, data = stocks, xlab = "date",
     ylab = "Cumulative value of initial $1", type = "l", log = "y")

# Create a time series of logarithmic returns
# logarithmic returns follows the visual suggestion of the last plot
# Padding with NA keeps the vector the same length as the data frame
stocks$logreturns <- c(diff(log(stocks$Return_cumul)), NA)
# Why the NA?
plot(logreturns ~ Date, data = stocks, xlab = "date",
     ylab = "Monthly logarithmic returns", type = "l")
# Look at the histogram
hist(stocks$logreturns, freq = FALSE, xlab = "log returns", n = 51,
     main = "Distribution of log returns")

# The conventional model: log returns are IID Gaussian
# Maximum-likelihood estimates of mean and variance are the sample values
# Need na.rm here because of the final NA in the vector of returns
# R trick: wrapping an assignment in parentheses, (a <- b), does the
# assignment and prints out the value assigned
(mean.logreturns <- mean(stocks$logreturns, na.rm = TRUE))
(sd.logreturns <- sd(stocks$logreturns, na.rm = TRUE))
curve(dnorm(x, mean = mean.logreturns, sd = sd.logreturns), add = TRUE, lwd = 3)

# How bad are the worst 1% of months, according to the model?
# "99% value at risk" is (roughly) this fractional loss * size of portfolio
# Analytical answer:
qnorm(0.01, mean.logreturns, sd.logreturns)
# Presumes we can work out the quantile function exactly
# Simulation answer:
# - Create a very long simulation run (a million points)
# - Look at the sample quantile in that long run
quantile(rnorm(1e6, mean.logreturns, sd.logreturns), 0.01)

# How bad are the worst 1% of two months in a row?
### Exercise: find the analytical answer in the Gaussian model
# Simulation answer:
# - Make another (or the same!) long run
# - Take sum of consecutive pairs
# - Take quantile of the sums
norm.sim.run <- rnorm(1e6, mean.logreturns, sd.logreturns)
quantile(head(norm.sim.run, -1) + tail(norm.sim.run, -1), 0.01)

# Does the model look like the real data?
# Start by plotting the real data again
plot(logreturns ~ Date, data = stocks, xlab = "date",
     ylab = "Monthly logarithmic returns", type = "l", lwd = 2)
# Now do a simulation run, matched in length to the data, and add it to the
# plot, but fainter
lines(stocks$Date,
      rnorm(n = nrow(stocks), mean = mean.logreturns, sd = sd.logreturns),
      col = "darkgrey", lwd = 1)
# This shouldn't line up perfectly (why not?), but:
# - data has more comparatively large movements than the model
# - spikes seem to cluster more in the data than in the model

# How bad were the worst 1% of months?
(q0.01 <- quantile(stocks$logreturns, 0.01, na.rm = TRUE))
# What's the probability that the sample 1%, over this length of time, would
# be at least that low?
# Analytical answer: just try to find it!
# Simulation answer: just try it out:

# Simulate one history of IID Gaussian log returns, matched in length to the
# real data; uses the globals mean.logreturns and sd.logreturns
sim.history.std <- function() {
  rnorm(n = nrow(stocks), mean = mean.logreturns, sd = sd.logreturns)
}
simd.1stpercent.1k <- replicate(1000, quantile(sim.history.std(), 0.01))
mean(simd.1stpercent.1k <= q0.01)

# Let's try something with heavier tails: the t distribution
library(MASS)
# The fitdistr() function fits parametric distributions by maximum
# likelihood; it knows about the t distribution
(t.est <- fitdistr(na.omit(stocks$logreturns), "t"))
# The standard t distribution is centered around 0 and has scale 1
# fitdistr shifts the center by m and changes the scale by s
# so (X-m)/s has a standard t distribution

# restore the plot with the histogram
hist(stocks$logreturns, freq = FALSE, xlab = "log returns", n = 51,
     main = "Distribution of log returns", ylim = c(0, 15))
# and the Gaussian density curve
curve(dnorm(x, mean = mean.logreturns, sd = sd.logreturns), add = TRUE, lwd = 3)

# Add the t-distribution density curve
# R has a built-in density for the t distribution, dt(), but we need
# to deal with the shift and the scale

# Density of a shifted-and-scaled t distribution, as fit by fitdistr()
#   x: vector of points at which to evaluate the density
#   fitted.t: fitdistr() object with estimates named "m", "s", "df"
dt.fitted <- function(x, fitted.t = t.est) {
  m <- fitted.t$estimate["m"]
  s <- fitted.t$estimate["s"]
  df <- fitted.t$estimate["df"]
  return((1 / s) * dt((x - m) / s, df = df)) # why the (1/s) factor out front?
}
# Finally, plot the density of the fitted t distribution
curve(dt.fitted(x), add = TRUE, col = "blue", lwd = 3)
# Looks definitely more promising, if not necessarily right

# Write something to simulate IID draws from the fitted t distribution
# Uses the built-in rt(), but handles the shift and the scale
#   n: number of draws
#   fitted.t: fitdistr() object with estimates named "m", "s", "df"
rt.fitted <- function(n, fitted.t = t.est) {
  m <- fitted.t$estimate["m"]
  s <- fitted.t$estimate["s"]
  df <- fitted.t$estimate["df"]
  t <- rt(n, df = df)
  x <- s * t + m
  return(x)
}

# How does a simulated path look?
plot(logreturns ~ Date, data = stocks, xlab = "date",
     ylab = "Monthly logarithmic returns", type = "l", lwd = 2)
lines(stocks$Date, rt.fitted(n = nrow(stocks)), col = "darkgrey", lwd = 1)
# Much better spikiness, but still not much clustering

# How bad does this say the worst 1% of months are?
quantile(rt.fitted(1e6), 0.01)

#### Maybe the IID part is wrong? ####

# Tack 1: trends over time ("fixed effects", as in HW 2)
library(np)
# Fit a regression of returns on the date (a numerical variable)
time.trend <- npreg(logreturns ~ Date, data = stocks, tol = 1e-3, ftol = 1e-3)
# Restore the plot of the time series
plot(logreturns ~ Date, data = stocks, xlab = "date",
     ylab = "Monthly logarithmic returns", type = "l", lwd = 2)
# Plot the time trend
# Not quite flat, if you zoom in
# npreg drops the row with the NA return, so the fitted values are one
# shorter than the data; head(..., -1) keeps the dates aligned
lines(head(stocks$Date, -1), fitted(time.trend), col = "darkgrey", lwd = 4)
# Simulate some scatter around the time trend
# Note: rnorm now returns a vector!
# Simulate Gaussian scatter around the fitted time trend; each point gets the
# trend's fitted value as its mean, with the regression's MSE as the variance
lines(head(stocks$Date, -1),
      rnorm(n = length(fitted(time.trend)), mean = fitted(time.trend),
            sd = sqrt(time.trend$MSE)),
      col = "blue", lwd = 1)
# Looks a bit better but still isn't clustered enough
# Exercise: could you replace rnorm() above with a t distribution?

# OK, what about dependence from one month to the next, rather than a fixed
# effect of time?
# Plot this month's returns against next
plot(head(stocks$logreturns, -1), tail(stocks$logreturns, -1),
     xlab = "This month's returns", ylab = "Next month's returns")
# The heads and tails are getting old; make a data frame and be done
# r0 = this month's return, r1 = next month's return
returns.df <- data.frame(Date = head(stocks$Date, -1),
                         r0 = head(stocks$logreturns, -1),
                         r1 = tail(stocks$logreturns, -1))
returns.df <- head(returns.df, -1) # Remove an NA row
# Regress next month's returns on this month's
time.regression <- npreg(r1 ~ r0, data = returns.df, tol = 1e-3, ftol = 1e-3)
# Create a sequence of evenly spaced values on the horizontal axis for plotting
plotting.r0 <- seq(from = -0.3, to = 0.3, length.out = 100)
# Add the fitted values to the plot
lines(plotting.r0,
      predict(time.regression, newdata = data.frame(r0 = plotting.r0)),
      col = "red", lwd = 2)
# Weirdness out where there are just a few data points, but a pretty
# nearly linear trend through the bulk of the data
abline(a = 0, b = 1, col = "green", lwd = 2) # For comparison of slope

# Now, how do we use this to simulate step by step?
# Start with an empty vector to store results
# We could keep extending a vector, but that's very slow, so preallocate;
# numeric() gives a double vector directly (vector() would give a logical
# vector that got silently coerced on first assignment)
fake.returns <- numeric(nrow(stocks))
# Give it an initial value for the first month
fake.returns[1] <- stocks$logreturns[1]
# Now for every new month
for (t in 2:nrow(stocks)) {
  # Work out the expected value from the previous month's simulated returns
  expected <- predict(time.regression,
                      newdata = data.frame(r0 = fake.returns[t - 1]))
  # Add some noise and stick it in the vector
  fake.returns[t] <- expected +
    rnorm(n = 1, mean = 0, sd = sqrt(time.regression$MSE))
}
plot(logreturns ~ Date, data = stocks, xlab = "date",
     ylab = "Monthly logarithmic returns", type = "l", lwd = 2)
lines(stocks$Date, fake.returns, col = "purple", lwd = 2)
quantile(fake.returns, 0.01)
# Looks rather more convincingly clustered, but not spiky enough
# Exercise: turn the simulation into a function, to make it easier to re-run
# Exercise: Replace rnorm() in the code with drawing from a t distribution