# Scoring algorithm for ELSA data in Chapter 4 # Ardo , UCL 2016 ################################################## # Data are not available. Access to ELSA data can # be requested via the Economic and Social # Data Service (www.esds.ac.uk). # # Data format needed for this code: # # id state age sex educ # 1 2 1 1 0 # 1 3 3 1 0 # 1 4 5 1 0 # 1 4 7 1 0 # 2 2 1 0 0 # 2 2 3 0 0 etc. # # # Typically age is a shifted version of age in years # # The models below are defined by # 1. Two-column matrices to determine which # transitions are modelled by the intercept, age, # and covariates # 2. Vectors <..free> to identify the rows of the # free parameters in # 3. Matrices <..fixed> with no. rows equal to the # length of vector <..free> to specify the # link between free and fixed parameters # # #################################################### # Prelim: digits <- 3 set.seed(12345) # Prepare data for quick access: da <- new.env() for(i in 1:N){ data.i <- dta[dta$id==subjects[i],] eval(parse(text=paste(paste("da$d", as.character(i),sep=""),"<-data.i",sep=""))) } da <- as.list(da) cat("Data prepared.\n") # Data info: subjects <- as.numeric(names(table(dta$id))) N <- length(subjects) cat("Sample size =",N,"\n") cat("Frequencies observed state:"); print(table(dta$state)) cat("State table:"); print(statetable.msm(state,id,data=dta)) # Definitions for states and transitions: # Dead state: D <- 5 # Intercepts: Qind <- cbind(c(1,1,2,2,2,3,3,3,4,4),c(2,D,1,3,D,2,4,D,3,D)) npar <- nrow(Qind) qnames <- rep("q0",npar) # Intercept only?: NULLMODEL <- FALSE if(NULLMODEL){ # Total no. of parameters: nparTotal <- npar MODEL <- 0 }else{ # Age: QindA <- cbind(c(1,1,2,2,3,3,4),c(2,D,3,D,4,D,D)) # Choose baseline distributions (only for age-dependent transitions!): # Select by using/not-using #-sign: MODEL <- 1; baseline <- rep("Gomp",7) #MODEL <- 2; baseline <- c("Gomp","Weib", "Gomp","Weib","Gomp","Weib","Weib") #MODEL <- 3; baseline <- rep("Weib",7) #MODEL <- 4; baseline <- c("Weib", "Gomp","Weib","Gomp","Weib","Gomp","Gomp") # Check: if(length(baseline)!=nrow(QindA)){ stop("\nERROR. Number of specified baseline distr. not correct.\n") } A.free <- c(1,2,3,5) nparA <- length(A.free) A.fixed <- matrix(0,length(A.free),nrow(QindA)) A.fixed[1,1] <- 1 A.fixed[2,] <- c(0,1,0,1,0,1,1) A.fixed[3,3] <- 1 A.fixed[4,5] <- 1 qnames <- c(qnames,rep("qAge",nparA)) # Total no. of parameters: nparTotal <- npar+nparA } # Number of covariates: ncov <- 2 # Enforce ncov=0 for NULLMODEL: if(NULLMODEL){ncov <- 0} # Define covariates effects if any: if(ncov>0){ # X1: QindX1 <- cbind(c(1,1,2,2,3,3,4),c(2,D,3,D,4,D,D)) X1.free <- c(1,2,3,5) nparX1 <- length(X1.free) X1.fixed <- matrix(0,length(X1.free),nrow(QindX1)) X1.fixed[1,1] <- 1 X1.fixed[2,] <- c(0,1,0,1,0,1,1) X1.fixed[3,3] <- 1 X1.fixed[4,5] <- 1 qnames <- c(qnames,rep("qX1",nparX1)) # X2: QindX2 <- cbind(c(1,2,3),c(2,3,4)) nparX2 <- nrow(QindX2) X2.fixed <- diag(nrow(QindX2)) qnames <- c(qnames,rep("qX2",nparX2)) # Total no. of parameters: nparTotal <- npar+nparA+nparX1+nparX2 } ############################################################# # Functions for scoring: # Building Q matrix: Qmatrix <- function(beta,x,x1,x2){ Q <- matrix(0,D,D) for(k in 1:npar){ Q[Qind[k,1],Qind[k,2]] <- exp(beta[k]) } if(!NULLMODEL){ for(k in 1:nparA){ for(l in 1:ncol(A.fixed)){ if(baseline[l]=="Gomp"){ if(A.fixed[k,l]==1){ Q[QindA[l,1],QindA[l,2]] <- Q[QindA[l,1],QindA[l,2]]*exp(beta[npar+k]*x) } } if(baseline[l]=="Weib"){ if(A.fixed[k,l]==1){ Q[QindA[l,1],QindA[l,2]] <- Q[QindA[l,1],QindA[l,2]]*exp(beta[npar+k])*x^(exp(beta[npar+k])-1) } } } } } # Covariates: if(ncov>0){ for(k in 1:nparX1){ for(l in 1:ncol(X1.fixed)){ if(X1.fixed[k,l]==1){ Q[QindX1[l,1],QindX1[l,2]] <- Q[QindX1[l,1],QindX1[l,2]]*exp(beta[npar+nparA+k]*x1) } } } for(k in 1:nparX2){ for(l in 1:ncol(X2.fixed)){ if(X2.fixed[k,l]==1){ Q[QindX2[l,1],QindX2[l,2]] <- Q[QindX2[l,1],QindX2[l,2]]*exp(beta[npar+nparA+nparX1+k]*x2) } } } } for(k in 1:(D-1)){Q[k,k] <- -sum(Q[k,])} # Return: return(Q) } # P matrix using eigenvalue decomposition: matrixexp <- function(Q,t){ Eigen <- eigen(Q) A <- Eigen$vectors d <- Eigen$values D <- diag(exp(d*t)) # Return: return(A%*%D%*%solve(A)) } # Building V: V <- function(G,d,t){ V0 <- matrix(0,D,D) for(i in 1:D){ for(j in 1:D){ if(i==j){ V0[i,j] <- G[i,i]*t*exp(d[i]*t) }else{ V0[i,j] <- G[i,j]*(exp(d[i]*t)-exp(d[j]*t))/(d[i]-d[j]) } } } # Return: return(V0) } # Derivative P matrix using eigenvalue decomposition: DerivP <- function(Q,t,x,x1,x2){ # Decomposition: Eigen <- eigen(Q, symmetric=FALSE) A <- Eigen$vectors d <- Eigen$values # Derivative matrices: Deriv <- array(NA,c(nparTotal,D,D)) for(k in 1:npar){ derivQ <- matrix(0,D,D) derivQ[Qind[k,1],Qind[k,2]] <- Q[Qind[k,1],Qind[k,2]] derivQ[Qind[k,1],Qind[k,1]] <- -derivQ[Qind[k,1],Qind[k,2]] G <- solve(A)%*%derivQ%*%A Deriv[k,1:D,1:D] <- A%*%V(G,d,t)%*%solve(A) } if(!NULLMODEL){ # Derivative for A: for(k in 1:nparA){ derivQ <- matrix(0,D,D) for(l in 1:ncol(A.fixed)){ if(A.fixed[k,l]==1){ if(baseline[l]=="Gomp"){ derivQ[QindA[l,1],QindA[l,2]] <- x*Q[QindA[l,1],QindA[l,2]] derivQ[QindA[l,1],QindA[l,1]] <- -derivQ[QindA[l,1],QindA[l,2]] } if(baseline[l]=="Weib"){ derivQ[QindA[l,1],QindA[l,2]] <- (1+exp(beta[npar+k])*log(x))*Q[QindA[l,1],QindA[l,2]] derivQ[QindA[l,1],QindA[l,1]] <- -derivQ[QindA[l,1],QindA[l,2]] } } } G <- solve(A)%*%derivQ%*%A Deriv[npar+k,1:D,1:D] <- A%*%V(G,d,t)%*%solve(A) } } # Covariates: if(ncov>0){ # Derivative for X1: for(k in 1:nparX1){ derivQ <- matrix(0,D,D) for(l in 1:ncol(X1.fixed)){ if(X1.fixed[k,l]==1){ derivQ[QindX1[l,1],QindX1[l,2]] <- x1*Q[QindX1[l,1],QindX1[l,2]] derivQ[QindX1[l,1],QindX1[l,1]] <- -derivQ[QindX1[l,1],QindX1[l,2]] } } G <- solve(A)%*%derivQ%*%A Deriv[npar+nparA+k,1:D,1:D] <- A%*%V(G,d,t)%*%solve(A) } # Derivative for X2: for(k in 1:nparX2){ derivQ <- matrix(0,D,D) for(l in 1:ncol(X2.fixed)){ if(X2.fixed[k,l]==1){ derivQ[QindX2[l,1],QindX2[l,2]] <- x2*Q[QindX2[l,1],QindX2[l,2]] derivQ[QindX2[l,1],QindX2[l,1]] <- -derivQ[QindX2[l,1],QindX2[l,2]] } } G <- solve(A)%*%derivQ%*%A Deriv[npar+nparA+nparX1+k,1:D,1:D] <- A%*%V(G,d,t)%*%solve(A) } } # Return: return(Deriv) } ################################## # Scoring algorithm: # Starting values: # For models without covariates: if(NULLMODEL){ p0 <- rep(-3,npar) } if(!NULLMODEL & ncov==0 & MODEL==1){ p0 <- c(rep(-3,npar),rep(0,nparA)) } # For models with two covariates: if(ncov>0){ # For Gompertz model: if(MODEL==1){ p0 <- c(rep(-3,npar),rep(0,nparA),rep(0,nparX1),rep(0,nparX2)) } # For model with Weibull mortality: if(MODEL==2){ p0 <- c(-3,-10,-3,-3,-10,-3,-3,-10,-3,-10, 0,.5,0,0, rep(0,nparX1),rep(0,nparX2)) } # For Weibull model: if(MODEL==3){ p0 <- c(rep(-10,npar), rep(.5,nparA), rep(0,nparX1),rep(0,nparX2)) } # For model with Gompertz mortality: if(MODEL==4){ p0 <- c(-10,-3,-10,-10,-3,-10,-10,-3,-10,-3, 0.5,0,.5,.5, rep(0,nparX1),rep(0,nparX2)) } } beta <- p0 cat("\nStarting values",beta,"\n") # Parametrs for no. of interations: iter <- 1 max.iter <- 50 diff <- rep(Inf,length(beta)) # Precision: abstol <- 1e-06 # Function to build score and info: Scoring <- function(beta){ S <- rep(0,nparTotal) M <- matrix(0,nparTotal,nparTotal) loglik <- 0 for(i in 1:N){ # Data and prelim for subject i: eval(parse(text=paste("data.i<-",paste("da$d",as.character(i),sep=""),sep=""))) O <- data.i$state age <- data.i$age x1 <- data.i$sex[1] x2 <- data.i$educ[1] # Loop over individual follow-up: for(j in 2:length(O)){ t <- age[j]-age[j-1] x <- age[j-1] # Q matrix: Q <- Qmatrix(beta,x=x,x1=x1,x2=x2) # P matrix and derivative: P <- matrixexp(Q,t=t) Deriv <- DerivP(Q,t,x=x,x1=x1,x2=x2) # Likelihood contribution: if(O[j]==D){ contribution <- P[O[j-1],1:(D-1)]%*%Q[1:(D-1),D] }else{ contribution <- P[O[j-1],O[j]] } loglik <- loglik+log(contribution) # Scoring contribution: S.i <- rep(0,nparTotal) if(O[j]==D){ # For all parameters: denom <- P[O[j-1],1:(D-1)]%*%Q[1:(D-1),D] for(k in 1:nparTotal){ S.i[k] <- Q[1:(D-1),D]%*%Deriv[k,O[j-1],1:(D-1)]/denom } # For intercept: for(k in 1:npar){ if(Qind[k,2]==D){ S.i[k] <- S.i[k] + Q[Qind[k,1],D]*P[O[j-1],Qind[k,1]]/denom} } if(!NULLMODEL){ # For age: for(k in 1:nparA){ for(l in 1:ncol(A.fixed)){ if(A.fixed[k,l]==1){ if(QindA[l,2]==D){ if(baseline[l]=="Gomp"){ S.i[npar+k] <- S.i[npar+k] + x*Q[QindA[l,1],D]*P[O[j-1],QindA[l,1]]/denom } if(baseline[l]=="Weib"){ S.i[npar+k] <- S.i[npar+k] + (1+exp(beta[npar+k])*log(x))*Q[QindA[l,1],D]*P[O[j-1],QindA[l,1]]/denom } } } } } } # Covariates: if(ncov>0){ # For X1: for(k in 1:nparX1){ for(l in 1:ncol(X1.fixed)){ if(X1.fixed[k,l]==1){ if(QindX1[l,2]==D){ S.i[npar+nparA+k] <- S.i[npar+nparA+k] + x1*Q[QindX1[l,1],D]*P[O[j-1],QindX1[l,1]]/denom } } } } # For X2: for(k in 1:nparX2){ for(l in 1:ncol(X2.fixed)){ if(X2.fixed[k,l]==1){ if(QindX2[l,2]==D){ S.i[npar+nparA+nparX1+k] <- S.i[npar+nparA+nparX1+k] + x2*Q[QindX2[l,1],D]*P[O[j-1],QindX2[l,1]]/denom } } } } } } if(O[j]!=D){ # For all parameters: denom <- P[O[j-1],O[j]] for(k in 1:nparTotal){ S.i[k] <- Deriv[k,O[j-1],O[j]]/denom } } # Update scoring and information: S <- S+S.i for(k in 1:nparTotal){ for(l in 1:nparTotal){ M[k,l] <- M[k,l] + S.i[k]*S.i[l] } } } } return(list(S=S,M=M,minus2loglik=-2*loglik)) } # Scoring algorithm: while(iter<=max.iter & sum(diff) > abstol){ # Scoring iteration SandM <- Scoring(beta) S <- SandM$S M <- SandM$M minus2loglik <- SandM$minus2loglik # Update beta: beta.old <- beta beta <- beta+solve(M)%*%S # Monitor convergence cat("Iteration =",iter,"\n-2Loglik =",minus2loglik,"\n") diff <- abs(beta.old-beta) cat("Updated parameters:\n",round(c(beta),digits),"\n") cat("Sum of absolute difference with old ones =",sum(diff),"\n\n") # Next iteration: iter <- iter+1 } # Results: cat("\nResults using",iter-1,"iterations (transformed parameters)\n") cat("and starting values: ",p0,"\n") p <- beta p.se <- sqrt(diag(solve(M))) print(cbind(q=qnames,p=round(p,digits), se=round(p.se,digits)),quote=FALSE) # Inference for Weibull tau-parameter in Model 2: if(MODEL==2){ variance <- diag(solve(M)) variance[12] <- variance[12]*(exp(p[12])^2) cat("For tau:\n") print(c(exp(p[12]),sqrt(variance[12]))) } # Information criterions: cat("With -2*loglik = ",minus2loglik,"\n") cat("and AIC = ",minus2loglik+2*length(beta),"\n") cat("and BIC = ",minus2loglik+log(N)*length(beta),"\n")