R code

Some gsea

###################################################
### chunk: ALL
###################################################
if(!exists("biocLite")) {
source("http://www.bioconductor.org/biocLite.R")
}

is.ALL.found=require("ALL")
if(!is.ALL.found) {
biocLite("ALL")
}

is.genefilter.found = require("genefilter")
if(!is.genefilter.found) {
biocLite("genefilter")
}

is.KEGGdb.found = require("KEGG.db")
if(!is.KEGGdb.found) {
biocLite("KEGG.db")
}

library("ALL")
data("ALL")

###################################################
### chunk: bcrabl
###################################################
bcell = grep("^B", as.character(ALL$BT))
moltyp = which(as.character(ALL$mol.biol)
%in% c("NEG", "BCR/ABL"))
ALL_bcrneg = ALL[, intersect(bcell, moltyp)]
ALL_bcrneg$mol.biol = factor(ALL_bcrneg$mol.biol)

###################################################
### chunk: nsFilter
###################################################
library("genefilter")
ALLfilt_bcrneg = nsFilter(ALL_bcrneg, var.cutoff=0.5)$eset

###################################################
### chunk: KEGGimat
###################################################
library("GSEABase")
gsc = GeneSetCollection(ALLfilt_bcrneg,
setType=KEGGCollection())
Am = incidence(gsc)
dim(Am)

###################################################
### chunk: EsetfromKEGG
###################################################
nsF = ALLfilt_bcrneg[colnames(Am),]

###################################################
### chunk: compttests
###################################################
rtt = rowttests(nsF, "mol.biol")
rttStat = rtt$statistic

###################################################
### chunk: reducetoInt
###################################################
selectedRows = (rowSums(Am)>10)
Am2 = Am[selectedRows, ]

###################################################
### chunk: pctests
###################################################
tA = as.vector(Am2 %*% rttStat)
tAadj = tA/sqrt(rowSums(Am2))
names(tA) = names(tAadj) = rownames(Am2)

###################################################
### chunk: qqplot
###################################################
x11()
qqpos=qqnorm(tAadj)
o = order(tAadj, decreasing=FALSE)
scan()
xpos = qqpos$x[o[1]]
ypos = qqpos$y[o[1]]
points(x = xpos, y = ypos, col = "red",
pch = 19, cex = 1.5)
legend(legend=KEGGPATHID2NAME[[names(tAadj)[o[1]]]],
x = xpos,
y = ypos + 1,
text.col = "red",
bty="n", cex = 1.5)
scan()

###################################################
### chunk: findSmPW
###################################################
library("KEGG.db")
smPW = tAadj[tAadj == min(tAadj, na.rm=T)]
pwName = KEGGPATHID2NAME[[names(smPW)]]
pwName

###################################################
### chunk: mnplot
###################################################
x11()
KEGGmnplot(names(smPW), nsF, "hgu95av2", nsF$"mol.biol",
pch=16, col="darkblue")
scan()

###################################################
### chunk: heatmap
###################################################
x11()
sel = as.integer(nsF$mol.biol)
KEGG2heatmap(names(smPW), nsF, "hgu95av2",
col=colorRampPalette(c("white", "darkblue"))(256),
ColSideColors=c("black", "white")[sel])
scan()

Babies 1: qqplot with marginal histograms

# A function for drawing QQ-plot of 2 distributions along with marginal histograms

qq.hist = function(x=rnorm(1000), y=rnorm(1000), xlab="X", ylab="Y")
{

# calculate quantiles
xq = quantile(x, probs = seq(0,1,0.01))
yq = quantile(y, probs = seq(0,1,0.01))

# calculate histogram values
minval = floor(min(c(x,y)))-1
maxval = ceiling(max(c(x,y)))+1
xhist = hist(x, breaks=seq(minval,maxval,0.5), plot=FALSE)
yhist = hist(y, breaks=seq(minval,maxval,0.5), plot=FALSE)

top <- max(c(xhist$counts, yhist$counts))
xrange <- c(minval,maxval)
yrange <- c(minval,maxval)

# layout
nf <- layout(matrix(c(2,4,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), TRUE)
layout.show(nf)

# draw the qq-plot
par(mar=c(4,4,1,1))
qqplot(x, y, xlim=xrange, ylim=yrange, pch=".", cex=2, main="", xlab = xlab, ylab=ylab, cex.lab=1.5)
abline(h=0,v=0,col="blue")

# histogram for the x-axis
par(mar=c(0,3,1,1))
barplot(xhist$counts, axes=F, ylim=c(0, top), space=0)

# histogram for the y-axis
par(mar=c(3,0,1,1))
barplot(yhist$counts, axes=F, xlim=c(0, top), space=0, horiz=TRUE)

}

Babies 1: fig 1.8

# A function for calculating coefficient of kurtosis
kurt = function(x)
{
return(mean(((x-mean(x, na.rm=T))/sd(x, na.rm=T))^4))
}

isSmoker = bab1$smoke == 1
currKurt = kurt(bab1$bwt[isSmoker])

# simulate a normal distribution 1000 times
numSamples = sum(isSmoker, na.rm=T)
simDat = matrix(rnorm(1000*numSamples), ncol=1000)
simKurt = apply(simDat, 2, kurt)
hist(simKurt, xlab="Kurtosis value", ylab="Number of samples", xlim=c(2,4), main="")

Babies 1: the normal curve

x=seq(-4,4,0.001)
y = exp(-(x^2)/2)/sqrt(2*pi)
plot(x=x, y=y, type="l", main="The normal curve: 68% of the area colored green")

# 68 % within 1 SD should be filled with light green
isWithin1sd = x>-1 & x<1
points(x=x[isWithin1sd], y=y[isWithin1sd], type="h", col="light green")

Babies 1: fig 1.6

# more margin on the sides
> par("mar"=c(5,10,4,10))
> boxplot(bab2$weight[!isUnknown], col="gray80", medcol="white", staplewex=1, staplelwd = 2, outwex=0.9, outlty=1, outpch=NA, ylim=c(50,300), ylab="Weight (pounds)", axes=F)
> axis(2)

Babies 1: fig 1.5

# Histogram of infant birth weight for 1236 babies in the CHDS subset
h = hist(bab1$bwt, plot=F)
h$counts = 100*h$density
plot(h, main="", col="gray20", border="white", xlab="Weight (ounces)", ylab="Percent per ounce", ylim=c(0,2.5))

Babies 1: fig 1.4

isSmoker = bab3$smoke == 1
isNonSmoker = bab3$smoke == 0
num = table(bab3$number[isSmoker])[1:7] # table of number of cigarettes smoked per day
num = round(num*100/sum(num))
num[6] = 5 # to make the sum 100

mid.num = c(2.5, 7.5, 12.5, 17.5, 25.0, 35.0, 50.0)
num.count = rep(mid.num, num)
brk = c(0,5,10,15,20,30,40,60)
h = hist(num.count, breaks=brk,plot=F)
h$density = 100*h$density
plot(h, col="gray20", border="white", xlab="Number of cigarettes per day", ylab="Percent per cigarette", main="")

Babies 1: figure 1.3

# code for drawing the histogram of mother's height for 1214 mothers in the CHDS subset
ht = bab2$height
ht = ht[ht != 99] # 99 means missing value
h=hist(ht, breaks=seq(50,75,1), plot=F)
h$counts = 100*h$density
plot(h, xlab="Height (inches)", ylab="Percent per inch", border="white", col="gray20", main="")

Babies: load data

#########################################
# Load data for maternal smoking and infant health #
#########################################

bab1 = read.table(file="http://www.stat.berkeley.edu/users/statlabs/data/babiesI.data", header=T)

# convert weight from ounce to gram [0.035 ounce = 1gm]
bab1$bwtgm = bab1$bwt/0.035

#########################################
# Load babies 2 data #
#########################################

bab2 = read.table(file="http://www.stat.berkeley.edu/users/statlabs/data/babies.data"
, header=T)

#########################################
# Load babies 3 data #
#########################################

bab3 = read.table(file="http://www.stat.berkeley.edu/users/statlabs/data/babies23.data"
, header=T)

DNA pattern: sliding window

######################
# Sliding window approach for finding the largest cluster
# interval size 1000 bp
# overlap of 500 bp
######################

winLen = 1000
overlap = 500
begins = seq(from=1, to=dnalen-winLen+1, by=overlap) # for storing the palindrome counts
numPals = rep(NA, length(begins))
for(i in 1:length(begins)) {
numPals[i]=sum(begins[i]:(begins[i]+winLen-1) %in% posvec)
cat(i, "\t", begins[i], "\n")
}
plot(numPals, type="l", xlab="Position of window (base pairs)", ylab="Palindrome count")

DNA pattern: biggest cluster

intervalSizes = seq(from=500, to=12000, by=500)
possibleRepOrigin = data.frame(intervalSize = intervalSizes, begin=rep(NA, length(intervalSizes)), end=rep(NA, length(intervalSizes)), unitClustSize= rep(NA, length(intervalSizes)))

for(i in 1:length(intervalSizes)) {
size = intervalSizes[i]
intervalHead = seq(from=1, to=dnalen, by=size)
if((intervalHead[length(intervalHead)]+size-1)>dnalen) {
numIntervals = length(intervalHead)-1
} else {
numIntervals = length(intervalHead)
}
palCountsEachInterval = rep(NA, numIntervals)
for(intv in 1:numIntervals) {
palCountsEachInterval[intv] = sum(intervalHead[intv]:(intervalHead[intv]+size-1) %in% posvec)
}
possibleRepOrigin$unitClustSize[i] = max(palCountsEachInterval)/size
possibleRepOrigin$begin[i] = intervalHead[which(palCountsEachInterval==max(palCountsEachInterval))[1]] # in case of a tie
if((possibleRepOrigin$begin[i]+size-1)>dnalen) {
possibleRepOrigin$end[i] = dnalen
} else {
possibleRepOrigin$end[i] = possibleRepOrigin$begin[i]+size
}
}
print(possibleRepOrigin)

DNA pattern: distribution of counts

intervalSizes = seq(from=1000, to=12000, by=1000)
par(mfrow=c(4,3))
for(i in 1:length(intervalSizes)) {
size = intervalSizes[i]
intervalHead = seq(from=1, to=dnalen, by=size)
if((intervalHead[length(intervalHead)]+size-1)>dnalen) {
numIntervals = length(intervalHead)-1
} else {
numIntervals = length(intervalHead)
}
palCountsEachInterval = rep(NA, numSegments)
for(intv in 1:numIntervals) {
palCountsEachInterval[intv] = sum(intervalHead[intv]:(intervalHead[intv]+size-1) %in% posvec)
}
if(i %in% seq(1,12,4)) {
ylab = "Frequency"
} else {
ylab = ""
}
if(i %in% 9:12) {
xlab = "Counts per interval"
} else {
xlab = ""
}

hist(palCountsEachInterval, main=paste("interval size ", size, sep=""), xlab=xlab, ylab=ylab)
}

DNA pattern: summing up consecutive locations

sums2 = posvec[2:length(posvec)]+posvec[1:(length(posvec)-1)]
sums3 = posvec[3:length(posvec)]+posvec[2:(length(posvec)-1)]+posvec[1:(length(posvec)-2)]
sums4 = posvec[4:length(posvec)]+posvec[3:(length(posvec)-1)]+posvec[2:(length(posvec)-2)]+posvec[1:(length(posvec)-3)]
sums5 = posvec[5:length(posvec)]+posvec[4:(length(posvec)-1)]+posvec[3:(length(posvec)-2)]+posvec[2:(length(posvec)-3)]+posvec[1:(length(posvec)-4)]
par(mfrow=c(2,2))
hist(sums2, main="Histogram of sums: 2 consecutive palindromes", xlab="")
hist(sums3, main="Histogram of sums: 3 consecutive palindromes", xlab="")
hist(sums4, main="Histogram of sums: 4 consecutive palindromes", xlab="")
hist(sums5, main="Histogram of sums: 5 consecutive palindromes", xlab="")

DNA pattern: figure 4.1

size = 60000
nr = (dnalen %/% size) + as.numeric((dnalen %% size)>0) # number of horiz lines, accord to _size_
plot(x=c(1,size), y=c(nr,nr), type="l", xlab="", ylab="", ylim=c(0,nr), axes=F, col=par()$bg)
axis(1, at=seq(0,60000,10000), labels=paste(seq(0,60,10),"K", sep=""), tcl=0.5)

for(curr_row in 1:nr) {
# the horizontal line
segsize=size
begin = seq(from = 1, to=dnalen, by=size)[curr_row]
if((begin+size)>dnalen) {
segsize = dnalen-begin+1
}
lines(x=c(1,segsize), y=c(nr-curr_row+1, nr-curr_row+1), type="l")

# the vertical "tick"s for location of palindrome
pos.begin = (curr_row-1)*size+1 # beginning of the row
pos.end = pos.begin+segsize-1 # end of the row
pal.loc = posvec[posvec>pos.begin & posvec <= pos.end] # locations
pal.loc = pal.loc - size*(curr_row-1) # shift locations

for(i in 1:length(pal.loc)) {
lines(x=c(pal.loc[i],pal.loc[i]), y=c(nr-curr_row+1-0.1,nr-curr_row+1+0.1))
}
}

DNA pattern: load data

# read the data: CMV palindrome locations, needs network access
cmv=read.table(file="http://www.stat.berkeley.edu/users/statlabs/data/hcmv.data", header=T)

# convert the data to a vector
posvec = cmv[,1]

# length of CMV DNA
dnalen = 229354

R code

About Me

Saturday, March 14, 2009