# Splus exploratory data analysis for survival data with subject epochs
# Dec 2000  Howard Seltman and Terri Daigneault
# See SurvEDATest.q

# SurvEDA() plots data with multiple event times per subject.
# dtf is the name of the data.frame (usually created with read.table()).
# epochs is a vector of column numbers or names that specify (in order)
#      the event times in "dtf". The columns can be numeric, string dates,
#      or factor dates.
# zero is the name or number of the column which is assigned time zero.
#      This time is lined up on the plot for all subjects.
# markers is a vector of column names, blanks, or characters that indicate
#      what goes on the plot before the first time, at each subsequent
#      event time, and after the last event.  It's length may be as large as
#      length(epochs), or shorter if latter positons need no markers.
#      E.g. c("id","","X") puts the subject's id to the left of the time
#      line, and puts an X between events 2 and 3.  (Note: markers may also be
#      an n by length(epochs) character matrix.)
# markadj allows left/right adustment of the placement of the before-the-
#      timeline and after-the-timeline markers.  E.g. c(2,1) leaves 2
#      character's worth of space between the left label and the timeline,
#      and 1 character's worth between the timeline and the right label.
# mcex is a single number that sets the size of the marker text where 1
#      is standard size.
# markwidth is a numeric vector that specifies the maximum number of characters
#      displayed for each marker.
# vert is a vector of trues (T) and falses (F) that determines where
#      vertical separators are used to mark event times, e.g. c(T,F,T)
#      indicates vertical separators at the start of event 1 and between
#      events 2 and 3.  (F's are added if "vert" is too short.)
# colors is a vector of size 1 or length(epochs)-1 that specifies the colors
#      used for each time line segment
# sortcol is a number, column name or a pair of such that specifies the
#      ordering of the subjects in the plot.  If a pair is used, the
#      second column resolves ties.
# reverse can be set to T to put the highest sorted values toward the top
#      of the graph instead of the bottom.
# groups is a column names or number or a vector of the same length as the data
#      which specifies groups of subjects to be placed on separate plots.
# timediv is a time divisor number that e.g. can be set to 7 to change days
#      to weeks on the plot.
# timeadj is a number subtracted from X for plotting when zero is unspecified
# acex is a single number that sets the size of the text for the axis lables
#      where 1 is standard size and 1.3 is the default.
# tcex is a single number that sets the size of the text for the title(s)
#      where 1 is standard size and 1.3 is the default.
# xlab is the x axis label (default is "Time")
# ylab is the y axis label (default is "Subject")
# xladj is the number of extra character widths desired on the left (- for fewer).
# xradj is the number of extra character widths desired on the right (- for fewer).
# ltrim is a number, left of which timelines are trimmed
# rtrim is a number, right of which timelines are trimmed
# titl is a string or vector of strings that specify the titles for each 
#      group.  The default is the group ids (or the data.frame name if there
#      is only one group).
# rows and cols specify how to divide up the plot surface if you want
#      multiple plots per page.
# overtitl is the overall title, especially useful when there are multiple groups.
# ocex is a single number that sets the size of the text for the overall title
#      where 1 is standard size and 1.4 is the default.
# file is the name of a file to recieve the postscript plot (".ps" is added).
# horiz can be set to F is a vertical plot is desired for the output file.
# pause can be set to F so that when groups spill over to multiple plot
#      pages you won't get a "Click on the graph to continue" message.
# eventsdtf is the dataframe for events
# id is a string containing the column names to match dtf to eventsdtf
# eventdate is a string containing the event date column name
# eventyadj is a positive or negative value adjusting event y axis location
# eventchar is a character or dtf column name for plotting event characters
# ecex is the event character size
# new is a logical value; if F, events are added to an existing plot

SurvEDA=function(dtf, epochs, zero=NULL, 
                 markers=NULL, markadj=NULL, mcex=1, markwidth=NULL,
                 vert=NULL,
                 colors=2:(1+length(epochs)), sortcol=NULL, reverse=F,
                 groups=NULL, 
                 timediv=1, timeadj=0,
                 eventsdtf=NULL, id="", eventdate="", eventyadj=0,
                 eventchar="*", ecex=0.7,
                 acex=1.3, tcex=1.3,
                 xlab="Time", ylab="Subject", xladj=0, xradj=0,
                 ltrim=NULL, rtrim=NULL,
                 titl=NULL,
                 rows=1, cols=1, overtitl=NULL, ocex=1.4,
                 file=NULL, horiz=T, printit=F, pause=T, new=T) {

  # First argument must be a data.frame
  if (!is.data.frame(dtf))
    stop(paste(deparse(substitute(dtf)),"is not a data.frame!"))

  # Convert epochs to correct time scale
  Tim=as.matrix(dtf[,epochs])
  # Convert character or factor dates to days
  
  tmp=unlist(lapply(lapply(Tim,mode),match,table=c("numeric","character")))
  if (!all(tmp==1) && !all(tmp==2))
    stop("Epochs must be all numeric or all character dates")
  if (tmp[1]==2 || all(unlist(lapply(Tim,is.factor)))) {
    jnk=Tim
    Tim=matrix(NA,nrow(jnk),ncol(jnk))
    for (i in 1:ncol(Tim))
      Tim[,i]=as.numeric(dates(as.character(jnk[,i])))
  }
  K=ncol(Tim)-1
  # Convert, e.g. days to weeks
  if (timediv!=1)
    Tim=Tim/timediv
  # Align data to "zero" point
  if (!is.null(zero)) {
    if (mode(zero)!=mode(epochs))
      stop("epochs and zero must be of the same data type")
    Zero=(1:(K+1))[epochs==zero]
    sub=matrix(Tim[,Zero],nrow(Tim),K+1)
    Tim=Tim-sub
  }
  Trange=range(Tim,na.rm=T)
  if (!is.null(ltrim))
    Trange=c(max(Trange[1],ltrim), Trange[2])
  if (!is.null(rtrim))
    Trange=c(Trange[1], min(Trange[2],rtrim))
  Trange=Trange-timeadj

  # Setup groups
  if (is.null(groups)) {
    groups=rep(1,nrow(dtf))
    groupids=1
    if (is.null(titl))
      titl=deparse(substitute(dtf))
    grpcounts=nrow(dtf)
  } else {
    if (length(groups)>1 && length(groups)!=nrow(dtf))
      stop("groups must be of length 1 or nrow(dtf)")
    if (length(groups)==1)
      groups=dtf[,groups]
    groupids=sort(unique(groups))
    grpcounts=table(groups)
    if (min(grpcounts)==1 || length(grpcounts)!=length(groupids)) {
      print(grpcounts)
      print("Warning: Groups with <=1 member are not shown")
      grpcounts=grpcounts[grpcounts>0]
    }
    if (is.null(titl))
      titl=paste(as.character(groupids))
    else
      if (length(titl)!=length(groupids))
        stop("You need one title per group")
  }

  # Setup for multiple plots per page and for file saving 
  par(mfrow=c(rows,cols))
  if (!is.null(overtitl))
    par(oma=c(1,1,1.5+ocex,1))
  if (!is.null(file)) {
    if (!(substring(file,nchar(file)-2,nchar(file))==".ps"))
      tmp=".ps"
    else
      tmp=""
    if (length(groupids)>rows*cols)
      FileSep=paste(LETTERS[1:(ceiling(length(groupids)/rows/cols))], 
                    tmp, sep="")
    else
      FileSep=tmp
  }

  # Setup colors
  if (length(colors)<K) {
    if (K%%length(colors)!=0)
      stop("You need one color per time interval")
    else
      colors=rep(colors,K/length(colors))
  }

  # Setup adjustment of mark locations and x axis limits
  inR=exists("is.R") && is.R()==TRUE  # false implies Splus
  #if (inR) parcxy=par("cra") else parcxy=par("1em")
  parcxy=par("cxy")
  stdem=parcxy[1] / par("cex") / diff(par("usr")[1:2])
  totchr=1/stdem  # total characters that fit across plot surface
  Lextrachr=0
  Rextrachr=0
  LM=length(markers)
  if (!is.null(markers)) {
    if (is.null(markwidth))
      markwidth=rep(2,LM)
    tmp=length(markwidth)
    if (tmp<LM)
      markwidth=c(markwidth,rep(3,LM-tmp))
    if (markers[1]!="") {
      if (any(names(dtf)==markers[1]))   
        Lextrachr=min(markwidth[1],max(nchar(as.character(dtf[,markers[1]]))))
      else
        Lextrachr=min(markwidth[1],nchar(markers[1]))
    }
    if (LM>=K+1) {
      if (any(names(dtf)==markers[K+1]))
        Rextrachr=min(markwidth[K+1],max(nchar(as.character(dtf[,markers[K+1]]))))
      else
        Rextrachr=min(markwidth[K+1],nchar(markers[K+1]))
    }
  }
  if (!is.null(markadj)) {
    Lextrachr=Lextrachr+markadj[1]
    if (length(markadj)>1)
      Rextrachr=Lextrachr+markadj[2]
  }
  datawidth=totchr-Lextrachr-Rextrachr
  chrusr=diff(Trange)/datawidth*mcex   # width of marker char in usr units
  if (is.null(markadj))
    markadj=c(-chrusr*0.5,rep(0,K-1),chrusr*0.5)
  else 
    markadj=c(-chrusr*markadj[1], rep(0,K-1),
               chrusr*ifelse(length(markadj)==1,0.5,markadj[2]))
  # preparations for plotting
  Trange[1]=Trange[1]-chrusr*(Lextrachr+xladj)
  Trange[2]=Trange[2]+chrusr*(Rextrachr+xradj)


  # For each group
  nfig=rows*cols
  for (igrp in 1:length(groupids)) {
    if (grpcounts[igrp]<2)
      next
    grp=groupids[igrp]
    gdtf=dtf[groups==grp,]
    ATitle=titl[igrp]
 
    # Number of subjects
    N=nrow(gdtf)

    # Time (epoch) data stored in X
    X=Tim[groups==grp,]

    # Set up for sorting
    if (is.null(sortcol))
      ord=1:N
    else if (is.character(sortcol[1])) { 
      if (length(sortcol)>2)
        stop("Only 2 sort columns implemented")
      for (i in 1:length(sortcol)) {
        if (!any(names(gdtf)==sortcol[i]))
          stop(paste("sortcol",sortcol,"not in",deparse(substitute(gdtf))))
      }
      if (length(sortcol)==1)
        ord=order(gdtf[,sortcol])
      else
        ord=order(gdtf[,sortcol[1]],gdtf[,sortcol[2]])
    } else {
      if (any(sortcol<1) || any(sortcol>K))
        stop(paste("sortcol must be between 1 and",K))
      dif=t(apply(as.matrix(X),1,function(x)diff(unlist(x))))
      if (length(sortcol)==1)
        ord=order(dif[,sortcol])
      else
        ord=order(dif[,sortcol[1]],dif[,sortcol[2]])
    }
    if (!reverse)
      ord=rev(ord)

    # Set up subject timeline marks (labels)
    marks=matrix("",N,K+1)
    if (!is.null(markers)) {
      if (is.matrix(markers)) {
        marks=markers
      } else {
        for (i in 1:LM) {
          if (markers[i]!="") {
            if (any(names(gdtf)==markers[i]))
              marks[,i]=substring(as.character(gdtf[,markers[i]]),1,markwidth[i])
            else
              marks[,i]=rep(markers[i],N)
          }
        }
      }
    }

    # Setup vertical timeline segement separators
    vertsep=rep(F,K+1)
    if (!is.null(vert)) {
      if (mode(vert)!="logical" || any(is.na(vert)))
        stop("vert must be only T's and F's")
      vertsep[1:length(vert)]=vert
      vertht=min(0.4,0.15)
    }

    # preparations for plotting
    par(mar=c(6.1,6.1,5.1,2.1)+(acex>1)*c(acex-1,acex-1,0,0))

    # Plotting
    if (new) {
      if (inR) {
        plot(Trange, c(0,N+1), xlab=xlab, ylab=ylab, type="n", cex.lab=acex)
      } else {
        plot(Trange, c(0,N+1), xlab=xlab, ylab=ylab, type="n", cex=acex)
      }
    }
    TrueLeft=par("usr")[1]
    TrueRight=par("usr")[2]
    if (inR) {
      chrhtadj=eventyadj
      if (new) title(list(ATitle,cex=tcex))
    } else {
      chrhtadj=parcxy[2]*mcex*(eventyadj+0.25)
      if (new) title(ATitle,cex=tcex)
    }
    if (!is.null(overtitl) && (nfig==1 || igrp%%(rows*cols)==1)) {
        if (new) mtext(overtitl, outer=T, cex=ocex)
    }
    for (i in 1:N) {
      if (new) {
        for (j in 1:K) {
          lines(c(X[ord[i],j],X[ord[i],j+1])-timeadj, c(i,i), col=colors[j])
          if (marks[ord[i],j]!="")
            text(X[ord[i],j]+markadj[j]-timeadj, i+chrhtadj, marks[ord[i],j], 
                 col=1, adj=ifelse(j==1,1,0.5), cex=mcex)
          if (vertsep[j]==T)
            lines(c(X[ord[i],j],X[ord[i],j])-timeadj,
                  c(i-vertht,i+vertht), col=colors[j])
        }
      }
      if (new && marks[ord[i],K+1]!="")
        text(X[ord[i],K+1]+markadj[K+1]-timeadj, i+chrhtadj, 
             marks[ord[i],K+1], col=1, adj=0, cex=mcex)

      if (new && vertsep[K+1]==T)
        lines(c(X[ord[i],K+1],X[ord[i],K+1])-timeadj,
              c(i-vertht,i+vertht), col=1)      

      # Handle "events"
      if (!is.null(eventsdtf)) {
        if (id=="")
          stop(paste("id= is missing"))
        if (is.na(match(id,names(gdtf))))
          stop(paste(id,"is not in",deparse(substitute(gdtf))))
        if (is.na(match(id,names(eventsdtf))))
          stop(paste(id,"is not in",deparse(substitute(eventsdtf))))
        if (eventdate=="")
          stop(paste("eventdate= is missing"))
        if (is.na(match(eventdate,names(eventsdtf))))
          stop(paste(eventdate,"is not in",deparse(substitute(eventsdtf))))
        evnts=eventsdtf[eventsdtf[,id]==gdtf[ord[i],id],eventdate]
        if (length(evnts)>0) {
          goodevent=is.na(evnts)==F
          if (is.character(eventchar) && 
              !is.na(match(eventchar,names(eventsdtf)))) {
            jnk=eventsdtf[eventsdtf[,id]==gdtf[ord[i],id],eventchar]
            goodevent=(goodevent & is.na(jnk)==F)
            pch=paste(jnk[goodevent],collapse="")
          } else {
            pch=eventchar
          }
          if (any(goodevent)) {
            evnts=as.numeric(dates(as.character(evnts[goodevent])))
            # Convert, e.g. days to weeks
            if (timediv!=1)
              evnts=evnts/timediv
            # Align data to "zero" point
            if (!is.null(zero))
              evnts=evnts-sub[groups==grp,][ord[i]]
            evnts=evnts-timeadj
            evnts=evnts[evnts>=TrueLeft & evnts<=TrueRight]
            if (length(evnts)>0) {
              for(ii in 1:length(evnts))
                points(evnts[ii], i+chrhtadj, 
                       pch=substring(pch,ii,ii), cex=ecex)
            }
          }
        }
      }
    }

    # Save as a postscript file
    if (igrp%%(rows*cols)==0 || igrp==length(groupids)) {
      if (printit) {
        dev.print();
        if (!inR) dev.off()
        # Splus only: guiPrint("GraphSheet")
      }
      if (!is.null(file)) {
        tmp=paste(file,FileSep[ceiling(igrp/rows/cols)],sep="")
        dev.copy(postscript,tmp,horiz=TRUE)
        dev.off()
        # Window/Splus only option
        # export.graph(Name="GSD2", Graph="", ExportType="JPG", FileName=tmp,
        #     OutputFile="", ColorBits="", QFactor="", Progressive=F,
        #     NumPasses="", Height="", Width="", Units="")
      }
      if (pause && igrp!=length(groupids)) {
        print("Click on the graph to continue")
        locator(1)
      }
    }
  } # end for each group

  invisible(ord)
}