







Study with the several resources on Docsity
Earn points by helping other students or get them with a premium plan
Prepare for your exams
Study with the several resources on Docsity
Earn points to download
Earn points by helping other students or get them with a premium plan
Introduction to Statistical Learning (James/Witten/Hastie/Tibshirani)
Typology: Exercises
1 / 13
This page cannot be seen from the preview
Don't miss anything!








#----- save_plots = F set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen() DM = matrix( data=c(0.0,0.3,0.4,0.7,0.3,0.0,0.5,0.8,0.4,0.5,0.0,0.45,0.7,0.8,0.45, .0), nrow=4, ncol=4, byrow=TRUE ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_2_complete_linkage.eps", onefile=FALSE, horizontal=FALSE) } plot( hclust( as.dist( DM ), method="complete" ), main="complete linkage" ) if( save_plots ){ dev.off() } if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_2_single_linkage.eps", onefile=FALSE, horizontal=FALSE) } plot( hclust( as.dist( DM ), method="single" ), main="single linkage" ) if( save_plots ){ dev.off() }
#----- save_plots = F set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen()
DF = data.frame( x1=c(1,1,0,5,6,4), x2=c(4,3,4,1,2,0) ) n = dim(DF)[1] K = 2
labels = sample( 1:K, n, replace=TRUE ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_3_initial_plot.eps", onefile=FALSE, horizontal=FALSE) } plot( DF$x1, DF$x2, cex=2.0, pch=19, col=(labels+1), xlab="gene index", ylab="unpaired t-value" ) grid() if( save_plots ){ dev.off() } while( TRUE ){
cents = matrix( nrow=K, ncol=2 ) for( l in 1:K ){ samps = labels==l cents[l,] = apply( DF[samps,], 2, mean ) }
new_labels = rep( NA, n ) for( si in 1:n ){ smallest_norm = +Inf for( l in 1:K ){ nm = norm( as.matrix( DF[si,] - cents[l,] ), type="2" )
#----- save_plots = FALSE set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen()
n = 1000 # the number of genes m = 100 # the number of tissue samples ##n = 20 ##m = 5 mu_A = 0.1 ## the machines are mostly the same but a bit different sigma_A = 1. mu_B = -0. sigma_B = 4. mu_C = 0.0 # the control and the treatment means are similar mu_T = 0. X = matrix(0, nrow=n, ncol=m) prob_of_machine_A = seq(1, 1.e-6, length.out=m) machine = c() treatment = c() for( jj in 1:m ){
machine_used = sample(c('A', 'B'), size=1, prob=c(prob_of_machine_A[jj], 1-prob_of_machine_A[jj])) machine = c(machine, machine_used)
type = sample(c('C', 'T'), size=1, prob=c(0.5, 0.5)) treatment = c(treatment, type) if( machine_used=='A' ){ if( type=='C' ){ x = rnorm(n, mean=(mu_A+mu_C), sd=sigma_A) }else{ x = rnorm(n, mean=(mu_A+mu_T), sd=sigma_A)
}else{ if( type=='C' ){ x = rnorm(n, mean=(mu_B+mu_C), sd=sigma_B) }else{ x = rnorm(n, mean=(mu_B+mu_T), sd=sigma_B) } } X[, jj] = x } pr.out = prcomp(X, scale=TRUE) ##print(summary(pr.out))
print(pr.out$sdev[1:10]^2/sum(pr.out$sdev^2))
X_transformed = X - pr.out$x[,1] %*% t(pr.out$rotation[, 1])
print(t.test(X_transformed[, treatment=='C'], X_transformed[, treatment=='T']))
machine_A = machine=='A' X_A = X[, machine_A] machine_B = machine=='B' X_B = X[, machine_B] print(sprintf('mu_A= %f; mean(X_A)= %f; mu_B= %f; mean(X_B)= %f', mu_A, mean(X_A), mu_B, mean(X_B))) X_2 = cbind(X_A - mean(X_A), X_B - mean(X_B)) pr.out = prcomp(X_2, scale=TRUE) ##print(summary(pr.out))
print(pr.out$sdev[1:10]^2/sum(pr.out$sdev^2))
print(t.test(X_2[, treatment=='C'], X_2[, treatment=='T']))
#----- save_plots = FALSE set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen() pr.out = prcomp(USArrests, scale=TRUE)
pr.var = pr.out$sdev^ pve_1 = pr.var / sum(pr.var)
USArrests_scaled = scale( USArrests ) denom = sum( apply( USArrests_scaled^2, 2, sum ) ) Phi = pr.out$rotation USArrests_projected = USArrests_scaled %*% Phi # this is the same as pr.out$x numer = apply( pr.out$x^2, 2, sum ) pve_2 = numer / denom print(pve_1) print(pve_2) print(pve_1 - pve_2)
#----- save_plots = F set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen()
hclust.complete = hclust( dist(USArrests), method="complete" ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_9_complete_linkage.eps", onefile=FALSE, horizontal=FALSE) } plot( hclust.complete, xlab="", sub="", cex=0.9 ) if( save_plots ){ dev.off() } #cutree( hclust.complete, h=150 ) # height we cut at ct = cutree( hclust.complete, k=3 ) # number of clusters to cut into
for( k in 1:3 ){ print(k) print( rownames( USArrests )[ ct == k ] ) }
hclust.complete.scale = hclust( dist(scale(USArrests,center=FALSE)), method="complete" ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_9_complete_linkage_w_scaling .eps", onefile=FALSE, horizontal=FALSE) } plot( hclust.complete.scale, xlab="", sub="", cex=0.9 ) if( save_plots ){ dev.off() } ct = cutree( hclust.complete.scale, k=3 ) # number of clusters to cut into
#----- save_plots = FALSE set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen()
K = 3 # the number of classes n = 20 # the number of samples per class p = 50 # the number of variables
X_1 = matrix( rnorm(n*p), nrow=n, ncol=p ) for( row in 1:n ){ X_1[row,] = X_1[row,] + rep( 1, p ) }
X_2 = matrix( rnorm(n*p), nrow=n, ncol=p ) for( row in 1:n ){ X_2[row,] = X_2[row,] + rep( -1, p ) }
X_3 = matrix( rnorm(n*p), nrow=n, ncol=p ) for( row in 1:n ){ X_3[row,] = X_3[row,] + c( rep( +1, p/2 ), rep( -1, p/2 ) ) } X = rbind( X_1, X_2, X_3 ) labels = c( rep(1,n), rep(2,n), rep(3,n) ) # the "true" labels of the points pr.out = prcomp( X, scale=TRUE ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_10_pr_plot.eps", onefile=FALSE, horizontal=FALSE) } plot( pr.out$x[,1], pr.out$x[,2], col=labels, pch=19 ) grid() if( save_plots ){ dev.off() }
kmean.out = kmeans( X, centers=3, nstart=50 ) table( kmean.out$cluster, labels )
kmean.out = kmeans( X, centers=2, nstart=50 )
kmean.out = kmeans( X, centers=4, nstart=50 )
kmean.out = kmeans( pr.out$x[,c(1,2)], centers=3, nstart=50 )
Xs = scale( X ) kmean.out = kmeans( Xs, centers=3, nstart=50 )
n2 = apply( DF[ predicted==2, ], 2, length ) m1 = apply( DF[ predicted==1, ], 2, mean ) # the means across the 1000 genes in each cluster m2 = apply( DF[ predicted==2, ], 2, mean ) v1 = apply( DF[ predicted==1, ], 2, var ) # the variances across the 1000 genes in each cluster v2 = apply( DF[ predicted==2, ], 2, var ) pooled_variance = sqrt( v1 / n1 + v2 / n2 ) t_value = ( m1 - m2 ) / pooled_variance if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_11_t_value.eps", onefile=FALSE, horizontal=FALSE) } plot( t_value, xlab="gene index", ylab="unpaired t-value" ) if( save_plots ){ dev.off() }