# Splus exploratory data analysis for survival data with subject epochs # Dec 2000 Howard Seltman and Terri Daigneault # See SurvEDATest.q # SurvEDA() plots data with multiple event times per subject. # dtf is the name of the data.frame (usually created with read.table()). # epochs is a vector of column numbers or names that specify (in order) # the event times in "dtf". The columns can be numeric, string dates, # or factor dates. # zero is the name or number of the column which is assigned time zero. # This time is lined up on the plot for all subjects. # markers is a vector of column names, blanks, or characters that indicate # what goes on the plot before the first time, at each subsequent # event time, and after the last event. It's length may be as large as # length(epochs), or shorter if latter positons need no markers. # E.g. c("id","","X") puts the subject's id to the left of the time # line, and puts an X between events 2 and 3. (Note: markers may also be # an n by length(epochs) character matrix.) # markadj allows left/right adustment of the placement of the before-the- # timeline and after-the-timeline markers. E.g. c(2,1) leaves 2 # character's worth of space between the left label and the timeline, # and 1 character's worth between the timeline and the right label. # mcex is a single number that sets the size of the marker text where 1 # is standard size. # markwidth is a numeric vector that specifies the maximum number of characters # displayed for each marker. # vert is a vector of trues (T) and falses (F) that determines where # vertical separators are used to mark event times, e.g. c(T,F,T) # indicates vertical separators at the start of event 1 and between # events 2 and 3. (F's are added if "vert" is too short.) # colors is a vector of size 1 or length(epochs)-1 that specifies the colors # used for each time line segment # sortcol is a number, column name or a pair of such that specifies the # ordering of the subjects in the plot. If a pair is used, the # second column resolves ties. # reverse can be set to T to put the highest sorted values toward the top # of the graph instead of the bottom. # groups is a column names or number or a vector of the same length as the data # which specifies groups of subjects to be placed on separate plots. # timediv is a time divisor number that e.g. can be set to 7 to change days # to weeks on the plot. # timeadj is a number subtracted from X for plotting when zero is unspecified # acex is a single number that sets the size of the text for the axis lables # where 1 is standard size and 1.3 is the default. # tcex is a single number that sets the size of the text for the title(s) # where 1 is standard size and 1.3 is the default. # xlab is the x axis label (default is "Time") # ylab is the y axis label (default is "Subject") # xladj is the number of extra character widths desired on the left (- for fewer). # xradj is the number of extra character widths desired on the right (- for fewer). # ltrim is a number, left of which timelines are trimmed # rtrim is a number, right of which timelines are trimmed # titl is a string or vector of strings that specify the titles for each # group. The default is the group ids (or the data.frame name if there # is only one group). # rows and cols specify how to divide up the plot surface if you want # multiple plots per page. # overtitl is the overall title, especially useful when there are multiple groups. # ocex is a single number that sets the size of the text for the overall title # where 1 is standard size and 1.4 is the default. # file is the name of a file to recieve the postscript plot (".ps" is added). # horiz can be set to F is a vertical plot is desired for the output file. # pause can be set to F so that when groups spill over to multiple plot # pages you won't get a "Click on the graph to continue" message. # eventsdtf is the dataframe for events # id is a string containing the column names to match dtf to eventsdtf # eventdate is a string containing the event date column name # eventyadj is a positive or negative value adjusting event y axis location # eventchar is a character or dtf column name for plotting event characters # ecex is the event character size # new is a logical value; if F, events are added to an existing plot SurvEDA=function(dtf, epochs, zero=NULL, markers=NULL, markadj=NULL, mcex=1, markwidth=NULL, vert=NULL, colors=2:(1+length(epochs)), sortcol=NULL, reverse=F, groups=NULL, timediv=1, timeadj=0, eventsdtf=NULL, id="", eventdate="", eventyadj=0, eventchar="*", ecex=0.7, acex=1.3, tcex=1.3, xlab="Time", ylab="Subject", xladj=0, xradj=0, ltrim=NULL, rtrim=NULL, titl=NULL, rows=1, cols=1, overtitl=NULL, ocex=1.4, file=NULL, horiz=T, printit=F, pause=T, new=T) { # First argument must be a data.frame if (!is.data.frame(dtf)) stop(paste(deparse(substitute(dtf)),"is not a data.frame!")) # Convert epochs to correct time scale Tim=as.matrix(dtf[,epochs]) # Convert character or factor dates to days tmp=unlist(lapply(lapply(Tim,mode),match,table=c("numeric","character"))) if (!all(tmp==1) && !all(tmp==2)) stop("Epochs must be all numeric or all character dates") if (tmp[1]==2 || all(unlist(lapply(Tim,is.factor)))) { jnk=Tim Tim=matrix(NA,nrow(jnk),ncol(jnk)) for (i in 1:ncol(Tim)) Tim[,i]=as.numeric(dates(as.character(jnk[,i]))) } K=ncol(Tim)-1 # Convert, e.g. days to weeks if (timediv!=1) Tim=Tim/timediv # Align data to "zero" point if (!is.null(zero)) { if (mode(zero)!=mode(epochs)) stop("epochs and zero must be of the same data type") Zero=(1:(K+1))[epochs==zero] sub=matrix(Tim[,Zero],nrow(Tim),K+1) Tim=Tim-sub } Trange=range(Tim,na.rm=T) if (!is.null(ltrim)) Trange=c(max(Trange[1],ltrim), Trange[2]) if (!is.null(rtrim)) Trange=c(Trange[1], min(Trange[2],rtrim)) Trange=Trange-timeadj # Setup groups if (is.null(groups)) { groups=rep(1,nrow(dtf)) groupids=1 if (is.null(titl)) titl=deparse(substitute(dtf)) grpcounts=nrow(dtf) } else { if (length(groups)>1 && length(groups)!=nrow(dtf)) stop("groups must be of length 1 or nrow(dtf)") if (length(groups)==1) groups=dtf[,groups] groupids=sort(unique(groups)) grpcounts=table(groups) if (min(grpcounts)==1 || length(grpcounts)!=length(groupids)) { print(grpcounts) print("Warning: Groups with <=1 member are not shown") grpcounts=grpcounts[grpcounts>0] } if (is.null(titl)) titl=paste(as.character(groupids)) else if (length(titl)!=length(groupids)) stop("You need one title per group") } # Setup for multiple plots per page and for file saving par(mfrow=c(rows,cols)) if (!is.null(overtitl)) par(oma=c(1,1,1.5+ocex,1)) if (!is.null(file)) { if (!(substring(file,nchar(file)-2,nchar(file))==".ps")) tmp=".ps" else tmp="" if (length(groupids)>rows*cols) FileSep=paste(LETTERS[1:(ceiling(length(groupids)/rows/cols))], tmp, sep="") else FileSep=tmp } # Setup colors if (length(colors)=K+1) { if (any(names(dtf)==markers[K+1])) Rextrachr=min(markwidth[K+1],max(nchar(as.character(dtf[,markers[K+1]])))) else Rextrachr=min(markwidth[K+1],nchar(markers[K+1])) } } if (!is.null(markadj)) { Lextrachr=Lextrachr+markadj[1] if (length(markadj)>1) Rextrachr=Lextrachr+markadj[2] } datawidth=totchr-Lextrachr-Rextrachr chrusr=diff(Trange)/datawidth*mcex # width of marker char in usr units if (is.null(markadj)) markadj=c(-chrusr*0.5,rep(0,K-1),chrusr*0.5) else markadj=c(-chrusr*markadj[1], rep(0,K-1), chrusr*ifelse(length(markadj)==1,0.5,markadj[2])) # preparations for plotting Trange[1]=Trange[1]-chrusr*(Lextrachr+xladj) Trange[2]=Trange[2]+chrusr*(Rextrachr+xradj) # For each group nfig=rows*cols for (igrp in 1:length(groupids)) { if (grpcounts[igrp]<2) next grp=groupids[igrp] gdtf=dtf[groups==grp,] ATitle=titl[igrp] # Number of subjects N=nrow(gdtf) # Time (epoch) data stored in X X=Tim[groups==grp,] # Set up for sorting if (is.null(sortcol)) ord=1:N else if (is.character(sortcol[1])) { if (length(sortcol)>2) stop("Only 2 sort columns implemented") for (i in 1:length(sortcol)) { if (!any(names(gdtf)==sortcol[i])) stop(paste("sortcol",sortcol,"not in",deparse(substitute(gdtf)))) } if (length(sortcol)==1) ord=order(gdtf[,sortcol]) else ord=order(gdtf[,sortcol[1]],gdtf[,sortcol[2]]) } else { if (any(sortcol<1) || any(sortcol>K)) stop(paste("sortcol must be between 1 and",K)) dif=t(apply(as.matrix(X),1,function(x)diff(unlist(x)))) if (length(sortcol)==1) ord=order(dif[,sortcol]) else ord=order(dif[,sortcol[1]],dif[,sortcol[2]]) } if (!reverse) ord=rev(ord) # Set up subject timeline marks (labels) marks=matrix("",N,K+1) if (!is.null(markers)) { if (is.matrix(markers)) { marks=markers } else { for (i in 1:LM) { if (markers[i]!="") { if (any(names(gdtf)==markers[i])) marks[,i]=substring(as.character(gdtf[,markers[i]]),1,markwidth[i]) else marks[,i]=rep(markers[i],N) } } } } # Setup vertical timeline segement separators vertsep=rep(F,K+1) if (!is.null(vert)) { if (mode(vert)!="logical" || any(is.na(vert))) stop("vert must be only T's and F's") vertsep[1:length(vert)]=vert vertht=min(0.4,0.15) } # preparations for plotting par(mar=c(6.1,6.1,5.1,2.1)+(acex>1)*c(acex-1,acex-1,0,0)) # Plotting if (new) { if (inR) { plot(Trange, c(0,N+1), xlab=xlab, ylab=ylab, type="n", cex.lab=acex) } else { plot(Trange, c(0,N+1), xlab=xlab, ylab=ylab, type="n", cex=acex) } } TrueLeft=par("usr")[1] TrueRight=par("usr")[2] if (inR) { chrhtadj=eventyadj if (new) title(list(ATitle,cex=tcex)) } else { chrhtadj=parcxy[2]*mcex*(eventyadj+0.25) if (new) title(ATitle,cex=tcex) } if (!is.null(overtitl) && (nfig==1 || igrp%%(rows*cols)==1)) { if (new) mtext(overtitl, outer=T, cex=ocex) } for (i in 1:N) { if (new) { for (j in 1:K) { lines(c(X[ord[i],j],X[ord[i],j+1])-timeadj, c(i,i), col=colors[j]) if (marks[ord[i],j]!="") text(X[ord[i],j]+markadj[j]-timeadj, i+chrhtadj, marks[ord[i],j], col=1, adj=ifelse(j==1,1,0.5), cex=mcex) if (vertsep[j]==T) lines(c(X[ord[i],j],X[ord[i],j])-timeadj, c(i-vertht,i+vertht), col=colors[j]) } } if (new && marks[ord[i],K+1]!="") text(X[ord[i],K+1]+markadj[K+1]-timeadj, i+chrhtadj, marks[ord[i],K+1], col=1, adj=0, cex=mcex) if (new && vertsep[K+1]==T) lines(c(X[ord[i],K+1],X[ord[i],K+1])-timeadj, c(i-vertht,i+vertht), col=1) # Handle "events" if (!is.null(eventsdtf)) { if (id=="") stop(paste("id= is missing")) if (is.na(match(id,names(gdtf)))) stop(paste(id,"is not in",deparse(substitute(gdtf)))) if (is.na(match(id,names(eventsdtf)))) stop(paste(id,"is not in",deparse(substitute(eventsdtf)))) if (eventdate=="") stop(paste("eventdate= is missing")) if (is.na(match(eventdate,names(eventsdtf)))) stop(paste(eventdate,"is not in",deparse(substitute(eventsdtf)))) evnts=eventsdtf[eventsdtf[,id]==gdtf[ord[i],id],eventdate] if (length(evnts)>0) { goodevent=is.na(evnts)==F if (is.character(eventchar) && !is.na(match(eventchar,names(eventsdtf)))) { jnk=eventsdtf[eventsdtf[,id]==gdtf[ord[i],id],eventchar] goodevent=(goodevent & is.na(jnk)==F) pch=paste(jnk[goodevent],collapse="") } else { pch=eventchar } if (any(goodevent)) { evnts=as.numeric(dates(as.character(evnts[goodevent]))) # Convert, e.g. days to weeks if (timediv!=1) evnts=evnts/timediv # Align data to "zero" point if (!is.null(zero)) evnts=evnts-sub[groups==grp,][ord[i]] evnts=evnts-timeadj evnts=evnts[evnts>=TrueLeft & evnts<=TrueRight] if (length(evnts)>0) { for(ii in 1:length(evnts)) points(evnts[ii], i+chrhtadj, pch=substring(pch,ii,ii), cex=ecex) } } } } } # Save as a postscript file if (igrp%%(rows*cols)==0 || igrp==length(groupids)) { if (printit) { dev.print(); if (!inR) dev.off() # Splus only: guiPrint("GraphSheet") } if (!is.null(file)) { tmp=paste(file,FileSep[ceiling(igrp/rows/cols)],sep="") dev.copy(postscript,tmp,horiz=TRUE) dev.off() # Window/Splus only option # export.graph(Name="GSD2", Graph="", ExportType="JPG", FileName=tmp, # OutputFile="", ColorBits="", QFactor="", Progressive=F, # NumPasses="", Height="", Width="", Units="") } if (pause && igrp!=length(groupids)) { print("Click on the graph to continue") locator(1) } } } # end for each group invisible(ord) }