R code: June 2006

Babies 1: qqplot with marginal histograms

# A function for drawing QQ-plot of 2 distributions along with marginal histograms

qq.hist = function(x=rnorm(1000), y=rnorm(1000), xlab="X", ylab="Y")
{

# calculate quantiles
xq = quantile(x, probs = seq(0,1,0.01))
yq = quantile(y, probs = seq(0,1,0.01))

# calculate histogram values
minval = floor(min(c(x,y)))-1
maxval = ceiling(max(c(x,y)))+1
xhist = hist(x, breaks=seq(minval,maxval,0.5), plot=FALSE)
yhist = hist(y, breaks=seq(minval,maxval,0.5), plot=FALSE)

top <- max(c(xhist$counts, yhist$counts))
xrange <- c(minval,maxval)
yrange <- c(minval,maxval)

# layout
nf <- layout(matrix(c(2,4,1,3),2,2,byrow=TRUE), c(3,1), c(1,3), TRUE)
layout.show(nf)

# draw the qq-plot
par(mar=c(4,4,1,1))
qqplot(x, y, xlim=xrange, ylim=yrange, pch=".", cex=2, main="", xlab = xlab, ylab=ylab, cex.lab=1.5)
abline(h=0,v=0,col="blue")

# histogram for the x-axis
par(mar=c(0,3,1,1))
barplot(xhist$counts, axes=F, ylim=c(0, top), space=0)

# histogram for the y-axis
par(mar=c(3,0,1,1))
barplot(yhist$counts, axes=F, xlim=c(0, top), space=0, horiz=TRUE)

}

Babies 1: fig 1.8

# A function for calculating coefficient of kurtosis
kurt = function(x)
{
return(mean(((x-mean(x, na.rm=T))/sd(x, na.rm=T))^4))
}

isSmoker = bab1$smoke == 1
currKurt = kurt(bab1$bwt[isSmoker])

# simulate a normal distribution 1000 times
numSamples = sum(isSmoker, na.rm=T)
simDat = matrix(rnorm(1000*numSamples), ncol=1000)
simKurt = apply(simDat, 2, kurt)
hist(simKurt, xlab="Kurtosis value", ylab="Number of samples", xlim=c(2,4), main="")

Babies 1: the normal curve

x=seq(-4,4,0.001)
y = exp(-(x^2)/2)/sqrt(2*pi)
plot(x=x, y=y, type="l", main="The normal curve: 68% of the area colored green")

# 68 % within 1 SD should be filled with light green
isWithin1sd = x>-1 & x<1
points(x=x[isWithin1sd], y=y[isWithin1sd], type="h", col="light green")

Babies 1: fig 1.6

# more margin on the sides
> par("mar"=c(5,10,4,10))
> boxplot(bab2$weight[!isUnknown], col="gray80", medcol="white", staplewex=1, staplelwd = 2, outwex=0.9, outlty=1, outpch=NA, ylim=c(50,300), ylab="Weight (pounds)", axes=F)
> axis(2)

Babies 1: fig 1.5

# Histogram of infant birth weight for 1236 babies in the CHDS subset
h = hist(bab1$bwt, plot=F)
h$counts = 100*h$density
plot(h, main="", col="gray20", border="white", xlab="Weight (ounces)", ylab="Percent per ounce", ylim=c(0,2.5))

Babies 1: fig 1.4

isSmoker = bab3$smoke == 1
isNonSmoker = bab3$smoke == 0
num = table(bab3$number[isSmoker])[1:7] # table of number of cigarettes smoked per day
num = round(num*100/sum(num))
num[6] = 5 # to make the sum 100

mid.num = c(2.5, 7.5, 12.5, 17.5, 25.0, 35.0, 50.0)
num.count = rep(mid.num, num)
brk = c(0,5,10,15,20,30,40,60)
h = hist(num.count, breaks=brk,plot=F)
h$density = 100*h$density
plot(h, col="gray20", border="white", xlab="Number of cigarettes per day", ylab="Percent per cigarette", main="")

Babies 1: figure 1.3

# code for drawing the histogram of mother's height for 1214 mothers in the CHDS subset
ht = bab2$height
ht = ht[ht != 99] # 99 means missing value
h=hist(ht, breaks=seq(50,75,1), plot=F)
h$counts = 100*h$density
plot(h, xlab="Height (inches)", ylab="Percent per inch", border="white", col="gray20", main="")

Babies: load data

#########################################
# Load data for maternal smoking and infant health #
#########################################

bab1 = read.table(file="http://www.stat.berkeley.edu/users/statlabs/data/babiesI.data", header=T)

# convert weight from ounce to gram [0.035 ounce = 1gm]
bab1$bwtgm = bab1$bwt/0.035

#########################################
# Load babies 2 data #
#########################################

bab2 = read.table(file="http://www.stat.berkeley.edu/users/statlabs/data/babies.data"
, header=T)

#########################################
# Load babies 3 data #
#########################################

bab3 = read.table(file="http://www.stat.berkeley.edu/users/statlabs/data/babies23.data"
, header=T)

DNA pattern: sliding window

######################
# Sliding window approach for finding the largest cluster
# interval size 1000 bp
# overlap of 500 bp
######################

winLen = 1000
overlap = 500
begins = seq(from=1, to=dnalen-winLen+1, by=overlap) # for storing the palindrome counts
numPals = rep(NA, length(begins))
for(i in 1:length(begins)) {
numPals[i]=sum(begins[i]:(begins[i]+winLen-1) %in% posvec)
cat(i, "\t", begins[i], "\n")
}
plot(numPals, type="l", xlab="Position of window (base pairs)", ylab="Palindrome count")

DNA pattern: biggest cluster

intervalSizes = seq(from=500, to=12000, by=500)
possibleRepOrigin = data.frame(intervalSize = intervalSizes, begin=rep(NA, length(intervalSizes)), end=rep(NA, length(intervalSizes)), unitClustSize= rep(NA, length(intervalSizes)))

for(i in 1:length(intervalSizes)) {
size = intervalSizes[i]
intervalHead = seq(from=1, to=dnalen, by=size)
if((intervalHead[length(intervalHead)]+size-1)>dnalen) {
numIntervals = length(intervalHead)-1
} else {
numIntervals = length(intervalHead)
}
palCountsEachInterval = rep(NA, numIntervals)
for(intv in 1:numIntervals) {
palCountsEachInterval[intv] = sum(intervalHead[intv]:(intervalHead[intv]+size-1) %in% posvec)
}
possibleRepOrigin$unitClustSize[i] = max(palCountsEachInterval)/size
possibleRepOrigin$begin[i] = intervalHead[which(palCountsEachInterval==max(palCountsEachInterval))[1]] # in case of a tie
if((possibleRepOrigin$begin[i]+size-1)>dnalen) {
possibleRepOrigin$end[i] = dnalen
} else {
possibleRepOrigin$end[i] = possibleRepOrigin$begin[i]+size
}
}
print(possibleRepOrigin)

DNA pattern: distribution of counts

intervalSizes = seq(from=1000, to=12000, by=1000)
par(mfrow=c(4,3))
for(i in 1:length(intervalSizes)) {
size = intervalSizes[i]
intervalHead = seq(from=1, to=dnalen, by=size)
if((intervalHead[length(intervalHead)]+size-1)>dnalen) {
numIntervals = length(intervalHead)-1
} else {
numIntervals = length(intervalHead)
}
palCountsEachInterval = rep(NA, numSegments)
for(intv in 1:numIntervals) {
palCountsEachInterval[intv] = sum(intervalHead[intv]:(intervalHead[intv]+size-1) %in% posvec)
}
if(i %in% seq(1,12,4)) {
ylab = "Frequency"
} else {
ylab = ""
}
if(i %in% 9:12) {
xlab = "Counts per interval"
} else {
xlab = ""
}

hist(palCountsEachInterval, main=paste("interval size ", size, sep=""), xlab=xlab, ylab=ylab)
}

DNA pattern: summing up consecutive locations

sums2 = posvec[2:length(posvec)]+posvec[1:(length(posvec)-1)]
sums3 = posvec[3:length(posvec)]+posvec[2:(length(posvec)-1)]+posvec[1:(length(posvec)-2)]
sums4 = posvec[4:length(posvec)]+posvec[3:(length(posvec)-1)]+posvec[2:(length(posvec)-2)]+posvec[1:(length(posvec)-3)]
sums5 = posvec[5:length(posvec)]+posvec[4:(length(posvec)-1)]+posvec[3:(length(posvec)-2)]+posvec[2:(length(posvec)-3)]+posvec[1:(length(posvec)-4)]
par(mfrow=c(2,2))
hist(sums2, main="Histogram of sums: 2 consecutive palindromes", xlab="")
hist(sums3, main="Histogram of sums: 3 consecutive palindromes", xlab="")
hist(sums4, main="Histogram of sums: 4 consecutive palindromes", xlab="")
hist(sums5, main="Histogram of sums: 5 consecutive palindromes", xlab="")

DNA pattern: figure 4.1

size = 60000
nr = (dnalen %/% size) + as.numeric((dnalen %% size)>0) # number of horiz lines, accord to _size_
plot(x=c(1,size), y=c(nr,nr), type="l", xlab="", ylab="", ylim=c(0,nr), axes=F, col=par()$bg)
axis(1, at=seq(0,60000,10000), labels=paste(seq(0,60,10),"K", sep=""), tcl=0.5)

for(curr_row in 1:nr) {
# the horizontal line
segsize=size
begin = seq(from = 1, to=dnalen, by=size)[curr_row]
if((begin+size)>dnalen) {
segsize = dnalen-begin+1
}
lines(x=c(1,segsize), y=c(nr-curr_row+1, nr-curr_row+1), type="l")

# the vertical "tick"s for location of palindrome
pos.begin = (curr_row-1)*size+1 # beginning of the row
pos.end = pos.begin+segsize-1 # end of the row
pal.loc = posvec[posvec>pos.begin & posvec <= pos.end] # locations
pal.loc = pal.loc - size*(curr_row-1) # shift locations

for(i in 1:length(pal.loc)) {
lines(x=c(pal.loc[i],pal.loc[i]), y=c(nr-curr_row+1-0.1,nr-curr_row+1+0.1))
}
}

DNA pattern: load data

# read the data: CMV palindrome locations, needs network access
cmv=read.table(file="http://www.stat.berkeley.edu/users/statlabs/data/hcmv.data", header=T)

# convert the data to a vector
posvec = cmv[,1]

# length of CMV DNA
dnalen = 229354

R code

About Me

Friday, June 30, 2006