RNA-Seq workshop


Link to GitHub repository:

https://github.com/itmat/Normalization


0. Set up

ssh username@demohpc.pmacs.upenn.edu

mkdir $HOME/WORKSHOP
cd $HOME/WORKSHOP

mkdir -p $HOME/WORKSHOP/reads/sample1
mkdir -p $HOME/WORKSHOP/reads/sample2
mkdir -p $HOME/WORKSHOP/reads/sample3
mkdir -p $HOME/WORKSHOP/reads/sample4

cp /opt/rna_seq/data/sample1_*.fq $HOME/WORKSHOP/reads/sample1/ 
cp /opt/rna_seq/data/sample2_*.fq $HOME/WORKSHOP/reads/sample2/ 
cp /opt/rna_seq/data/sample3_*.fq $HOME/WORKSHOP/reads/sample3/ 
cp /opt/rna_seq/data/sample4_*.fq $HOME/WORKSHOP/reads/sample4/ 

1. STAR

cd $HOME/WORKSHOP/

Create a text file listing the names of the sample directories.

cat > $HOME/WORKSHOP/sample_dirs.txt

Paste the following.

sample1
sample2
sample3
sample4

CTRL-D

Align the data with STAR

bsub -Is bash
module load STAR-2.3.0e  

perl /opt/rna_seq/scripts/runstar_workshop.pl $HOME/WORKSHOP/sample_dirs.txt $HOME/WORKSHOP/reads/ /opt/rna_seq/data/star_chr1and2andM/

bjobs -w
ls -ltr $HOME/WORKSHOP/logs/
more $HOME/WORKSHOP/shell_scripts/sample1.runstar.sh

2. PORT

cd $HOME/WORKSHOP
tree
ls $HOME/WORKSHOP/reads/*/*.fq > $HOME/WORKSHOP/unaligned.txt
more /opt/rna_seq/scripts/workshop.cfg

[PART1]

run_normalization --sample_dirs $HOME/WORKSHOP/sample_dirs.txt --loc $HOME/WORKSHOP/reads/ --unaligned $HOME/WORKSHOP/unaligned.txt --samfilename Aligned.out.sam --cfg /opt/rna_seq/scripts/workshop.cfg -fq -depthExon 3 -depthIntron 3

bjobs -w 
ls -ltr $HOME/WORKSHOP/logs/ 
tail -f $HOME/WORKSHOP/logs/WORKSHOP.run_normalization.log 
less -S $HOME/WORKSHOP/STATS/GENE/expected_num_reads_gnorm.txt
less -S $HOME/WORKSHOP/STATS/EXON_INTRON_JUNCTION/expected_num_reads.txt
more $HOME/WORKSHOP/STATS/*/percent_high_expresser_*.txt

[PART2]

run_normalization --sample_dirs $HOME/WORKSHOP/sample_dirs.txt --loc $HOME/WORKSHOP/reads/ --unaligned $HOME/WORKSHOP/unaligned.txt --samfilename Aligned.out.sam --cfg  /opt/rna_seq/scripts/workshop.cfg -fq -depthExon 3 -depthIntron 3 -part2 -cutoff_highexp 5

bjobs -w 
ls -ltr $HOME/WORKSHOP/logs/ 
tail -f $HOME/WORKSHOP/logs/WORKSHOP.run_normalization.log 
cd $HOME/WORKSHOP/ 
tree -d

3. Data Visualization

UCSC Genome Browser

Gene:

http://itmat.data.s3.amazonaws.com/workshop4/sample1.gene.norm_junctions_hq.bed
http://itmat.data.s3.amazonaws.com/workshop4/sample1.gene.norm.Unique.cov.gz
http://itmat.data.s3.amazonaws.com/workshop4/sample2.gene.norm_junctions_hq.bed
http://itmat.data.s3.amazonaws.com/workshop4/sample2.gene.norm.Unique.cov.gz
http://itmat.data.s3.amazonaws.com/workshop4/sample3.gene.norm_junctions_hq.bed
http://itmat.data.s3.amazonaws.com/workshop4/sample3.gene.norm.Unique.cov.gz
http://itmat.data.s3.amazonaws.com/workshop4/sample4.gene.norm_junctions_hq.bed
http://itmat.data.s3.amazonaws.com/workshop4/sample4.gene.norm.Unique.cov.gz

Exon-Intron-Junction:

http://itmat.data.s3.amazonaws.com/workshop4/sample1.merged_junctions_hq.bed
http://itmat.data.s3.amazonaws.com/workshop4/sample1.Unique.cov.gz
http://itmat.data.s3.amazonaws.com/workshop4/sample2.merged_junctions_hq.bed
http://itmat.data.s3.amazonaws.com/workshop4/sample2.Unique.cov.gz
http://itmat.data.s3.amazonaws.com/workshop4/sample3.merged_junctions_hq.bed
http://itmat.data.s3.amazonaws.com/workshop4/sample3.Unique.cov.gz
http://itmat.data.s3.amazonaws.com/workshop4/sample4.merged_junctions_hq.bed
http://itmat.data.s3.amazonaws.com/workshop4/sample4.Unique.cov.gz

UCSC Track Hubs - Exon-Intron-Junction - all samples

UCSC Track Hubs - Exon-Intron-Junction - by condition

chr1:193,597,181-193,610,390


4. Differential Expression Analysis

bsub -Is bash
cd $HOME
mkdir $HOME/differential_expression
cd $HOME/differential_expression
module load R-3.1.1
R

In R:

Parse csv file:

d = read.csv("/opt/rna_seq/data/control_vs_treatment.csv",header=T,row.names=1)
head(d)

Plot Correlation Distance:

png("corr_dist.png")
dd <- as.dist((1 - cor(d))/2)
plot(hclust(dd), main=sprintf("sum of dist. %f, Correlation Distance", sum(dd)),ylim =   400)
dev.off()
Wilcoxon rank sum test (Mann-Whitney)
data=d
p_val = as.data.frame(c(rep(NA,dim(data)[1])))
colnames(p_val) = "P_VAL"

for (i in 1:dim(data)[1]) {
  p_val_cur <- tryCatch(
    wilcox.test(as.matrix(data[i,1:4]),as.matrix(data[i,5:8]),alternative="two.sided",exact=TRUE)$p.value,     error=function(x) 1 )
  p_val$P_VAL[i]=p_val_cur
}
Benjamini & Hochberg (1995) p-value correction
FDR <- as.data.frame(p.adjust(p_val$P_VAL, method="BH"))
colnames(FDR) = "FDR"
out_table = cbind(data,p_val,FDR)

Sort and write results to file:

out_table_sorted = out_table[order(out_table$FDR),] 
write.csv(out_table_sorted, file="wilcox_BH_treatment_vs_ctrl.csv")
SAMseq

Load library for SAMseq:

library(samr)

Prepare sample info:

control = d[,c(1:4)]
treatment = d[,c(5:8)]
x = cbind(control,treatment)
y = c(rep(1, 4), rep(2, 4))

Run SAMseq:

samfit <- SAMseq(x, y, resp.type = "Two class unpaired", fdr.output=1,geneid = row.names(x))
head(samfit$siggenes.table$genes.up)

Format results:

direction = as.data.frame(c(rep("up",dim(samfit$siggenes.table$genes.up)[1])))
colnames(direction) = "direction"
samfit$siggenes.table$genes.up = cbind(samfit$siggenes.table$genes.up,direction)

direction = as.data.frame(c(rep("lo",dim(samfit$siggenes.table$genes.lo)[1])))
colnames(direction) = "direction"
samfit$siggenes.table$genes.lo = cbind(samfit$siggenes.table$genes.lo,direction)
ss <- rbind(samfit$siggenes.table$genes.up, samfit$siggenes.table$genes.lo)
k = ss[,3:6]
rownames(k) = ss[,2]
l = merge(k,d[,c(1:8)],by="row.names",all.x = T)
rownames(l) = l$Row.names
l = l[,-1]

Sort and write results to file:

l = l[order(l$"q-value(%)"),]
write.csv(l, file="samSeq_treatment_vs_ctrl.csv")   

View the RAW markdown.