# In-class demos for simulation, 28 January 2016

# Load this week's homework data
stocks <- read.csv("http://www.stat.cmu.edu/~cshalizi/uADA/16/hw/03/stock_history.csv")

# Look at the cumulative returns
plot(Return_cumul ~ Date, data=stocks, xlab="date",
     ylab="Cumulative value of initial $1", type="l")
plot(Return_cumul ~ Date, data=stocks, xlab="date",
     ylab="Cumulative value of initial $1", type="l", log="y")

# Create a time series of logarithmic returns
stocks$logreturns <- c(diff(log(stocks$Return_cumul)), NA)
plot(logreturns ~ Date, data=stocks, xlab="date",
     ylab="Monthly logarithmic returns", type="l")

# Look at the histogram
hist(stocks$logreturns, freq=FALSE, xlab="log returns", n=51,
     main="Distribution of log returns")

# The conventional model: log returns are IID Gaussian
# Maximum-likelihood estimates of the mean and variance are the sample values
# Need na.rm here because of the final NA in the vector of returns
(mean.logreturns <- mean(stocks$logreturns, na.rm=TRUE))
(sd.logreturns <- sd(stocks$logreturns, na.rm=TRUE))
curve(dnorm(x, mean=mean.logreturns, sd=sd.logreturns), add=TRUE, lwd=3)

# How bad are the worst 1% of months, according to the model?
# "99% value at risk" is (roughly) this fractional loss * size of portfolio
# Analytical answer:
qnorm(0.01, mean.logreturns, sd.logreturns)
# Presumes we can work out the quantile function exactly
# Simulation answer:
# - Create a very long simulation run (a million points)
# - Look at the sample quantile in that long run
quantile(rnorm(1e6, mean.logreturns, sd.logreturns), 0.01)

# How bad are the worst 1% of two months in a row?
### Exercise: find the analytical answer in the Gaussian model
# Simulation answer:
# - Make another (or the same!) long run
# - Take the sum of consecutive pairs
# - Take the quantile of the sums
norm.sim.run <- rnorm(1e6, mean.logreturns, sd.logreturns)
quantile(head(norm.sim.run,-1) + tail(norm.sim.run,-1), 0.01)

# Does the model look like the real data?
# Start by plotting the real data again
plot(logreturns ~ Date, data=stocks, xlab="date",
     ylab="Monthly logarithmic returns", type="l", lwd=2)
# Now do a simulation run, matched in length to the data, and add it to the
# plot, but fainter
lines(stocks$Date,
      rnorm(n=nrow(stocks), mean=mean.logreturns, sd=sd.logreturns),
      col="darkgrey", lwd=1)
# This shouldn't line up perfectly (why not?), but:
# - the data has more comparatively large movements than the model
# - spikes seem to cluster more in the data

# How bad were the worst 1% of months?
(q0.01 <- quantile(stocks$logreturns, 0.01, na.rm=TRUE))
# What's the probability that the sample 1% quantile, over this length of
# time, would be at least that low?
# Analytical answer: just try to find it!
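# A hedged analytical sketch (my addition, not part of the demo): if we
# approximate the sample 1% point by the k-th smallest of the n draws, with
# k = ceiling(0.01*n), then that order statistic is <= q0.01 exactly when at
# least k of the n Gaussian draws fall at or below q0.01, and the count of
# such draws is binomial; this ignores the interpolation quantile() does, so
# treat the result as approximate (n.obs, p.tail, and k.rank are my names)
n.obs <- sum(!is.na(stocks$logreturns))
p.tail <- pnorm(q0.01, mean=mean.logreturns, sd=sd.logreturns)
k.rank <- ceiling(0.01*n.obs)
pbinom(k.rank-1, size=n.obs, prob=p.tail, lower.tail=FALSE)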
# Simulation answer: just try it out
sim.history.std <- function() {
    rnorm(n=nrow(stocks), mean=mean.logreturns, sd=sd.logreturns)
}
simd.1stpercent.1k <- replicate(1000, quantile(sim.history.std(), 0.01))
mean(simd.1stpercent.1k <= q0.01)

# Let's try something with heavier tails: the t distribution
library(MASS)
# The fitdistr() function fits parametric distributions by maximum
# likelihood; it knows about the t distribution
# R trick: wrapping an assignment in parentheses, (a <- b), does the
# assignment and prints out the value assigned
(t.est <- fitdistr(na.omit(stocks$logreturns), "t"))
# The standard t distribution is centered around 0 and has scale 1
# fitdistr shifts the center by m and changes the scale by s,
# so (X-m)/s has a standard t distribution

# Restore the plot with the histogram
hist(stocks$logreturns, freq=FALSE, xlab="log returns", n=51,
     main="Distribution of log returns", ylim=c(0,15))
# and the Gaussian density curve
curve(dnorm(x, mean=mean.logreturns, sd=sd.logreturns), add=TRUE, lwd=3)

# Add the t-distribution density curve
# R has a built-in density for the t distribution, dt(), but we need
# to deal with the shift and the scale
dt.fitted <- function(x, fitted.t=t.est) {
    m <- fitted.t$estimate["m"]
    s <- fitted.t$estimate["s"]
    df <- fitted.t$estimate["df"]
    return((1/s)*dt((x-m)/s, df=df)) # why the (1/s) factor?
}
# Finally, plot the density of the fitted t distribution
curve(dt.fitted(x), add=TRUE, col="blue", lwd=3)
# Looks definitely more promising, if not necessarily right

# Write something to simulate IID draws from the fitted t distribution
# Uses the built-in rt(), but handles the shift and the scale
rt.fitted <- function(n, fitted.t=t.est) {
    m <- fitted.t$estimate["m"]
    s <- fitted.t$estimate["s"]
    df <- fitted.t$estimate["df"]
    t <- rt(n, df=df)
    x <- s*t + m
    return(x)
}
# How does a simulated path look?
plot(logreturns ~ Date, data=stocks, xlab="date",
     ylab="Monthly logarithmic returns", type="l", lwd=2)
lines(stocks$Date, rt.fitted(n=nrow(stocks)), col="darkgrey", lwd=1)
# Much better spikiness, but still not much clustering
# How bad does this say the worst 1% of months are?
quantile(rt.fitted(1e6), 0.01)

#### Maybe the IID part is wrong? ####

# Tack 1: trends over time ("fixed effects", as in HW 2)
library(np)
# Fit a regression of returns on the date (a numerical variable)
time.trend <- npreg(logreturns ~ Date, data=stocks, tol=1e-3, ftol=1e-3)
# Restore the plot of the time series
plot(logreturns ~ Date, data=stocks, xlab="date",
     ylab="Monthly logarithmic returns", type="l", lwd=2)
# Plot the time trend
# Not quite flat, if you zoom in
lines(head(stocks$Date,-1), fitted(time.trend), col="darkgrey", lwd=4)
# Simulate some scatter around the time trend
# Note: rnorm now returns a vector!
lines(head(stocks$Date,-1),
      rnorm(n=length(fitted(time.trend)), mean=fitted(time.trend),
            sd=sqrt(time.trend$MSE)),
      col="blue", lwd=1)
# Looks a bit better, but still isn't clustered enough
# Exercise: could you replace rnorm() above with a t distribution?

# OK, what about dependence from one month to the next, rather than a fixed
# effect of time?
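# A quick numerical check before plotting (a sketch I'm adding, not in the
# original demo): the correlation between each month's return and the next's
# use="complete.obs" drops the pair involving the final NA
cor(head(stocks$logreturns,-1), tail(stocks$logreturns,-1),
    use="complete.obs")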
# Plot this month's returns against next
plot(head(stocks$logreturns,-1), tail(stocks$logreturns,-1),
     xlab="This month's returns", ylab="Next month's returns")
# The heads and tails are getting old; make a data frame and be done
returns.df <- data.frame(Date=head(stocks$Date,-1),
                         r0=head(stocks$logreturns,-1),
                         r1=tail(stocks$logreturns,-1))
returns.df <- head(returns.df,-1) # Remove an NA row
# Regress next month's returns on this month's
time.regression <- npreg(r1 ~ r0, data=returns.df, tol=1e-3, ftol=1e-3)
# Create a sequence of evenly spaced values on the horizontal axis for plotting
plotting.r0 <- seq(from=-0.3, to=0.3, length.out=100)
# Add the fitted values to the plot
lines(plotting.r0,
      predict(time.regression, newdata=data.frame(r0=plotting.r0)),
      col="red", lwd=2)
# Weirdness out where there are just a few data points, but a pretty
# nearly linear trend through the bulk of the data
abline(a=0, b=1, col="green", lwd=2) # For comparison of slope

# Now, how do we use this to simulate step by step?
# Start with an empty vector to store results
# We could keep extending a vector, but that's very slow
fake.returns <- vector(length=nrow(stocks))
# Give it an initial value for the first month
fake.returns[1] <- stocks$logreturns[1]
# Now, for every new month:
for (t in 2:nrow(stocks)) {
    # Work out the expected value from the previous month's simulated
    # returns
    expected <- predict(time.regression,
                        newdata=data.frame(r0=fake.returns[t-1]))
    # Add some noise and stick it in the vector
    fake.returns[t] <- expected +
        rnorm(n=1, mean=0, sd=sqrt(time.regression$MSE))
}
plot(logreturns ~ Date, data=stocks, xlab="date",
     ylab="Monthly logarithmic returns", type="l", lwd=2)
lines(stocks$Date, fake.returns, col="purple", lwd=2)
quantile(fake.returns, 0.01)
# Looks rather more convincingly clustered, but not spiky enough
# Exercise: turn the simulation into a function, to make it easier to
# re-run (one possible sketch follows below)
# Exercise: replace rnorm() in the code with drawing from a t distribution
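# A hedged sketch for the first exercise (sim.returns, n.months, and
# r.initial are my own names, not from the demo): wrap the step-by-step
# simulation above in a function so it can be re-run in one call
sim.returns <- function(n.months=nrow(stocks),
                        r.initial=stocks$logreturns[1]) {
    fake <- vector(length=n.months)
    fake[1] <- r.initial
    for (t in 2:n.months) {
        expected <- predict(time.regression,
                            newdata=data.frame(r0=fake[t-1]))
        fake[t] <- expected + rnorm(n=1, mean=0, sd=sqrt(time.regression$MSE))
    }
    return(fake)
}
# e.g., quantile(sim.returns(), 0.01) should be comparable to the quantile
# computed above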