Loading the data/libraries

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.0.5

library(reshape)
library(MAUDE)
maudeGitPathRoot = "https://raw.githubusercontent.com/de-Boer-Lab/MAUDE/master"

Load sample metadata.

allSamples = read.table(file=sprintf("%s/vignettes/BACH2_data/sample_metadata.txt",maudeGitPathRoot), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F)
head(allSamples)

##                                                                                  directory
## 1           CRISPResso_BACH2/CRISPResso_on_BACH2-A1-A_S1.trimmed.1_BACH2-A1-A_S1.trimmed.2
## 2           CRISPResso_BACH2/CRISPResso_on_BACH2-A1-B_S2.trimmed.1_BACH2-A1-B_S2.trimmed.2
## 3           CRISPResso_BACH2/CRISPResso_on_BACH2-A1-C_S3.trimmed.1_BACH2-A1-C_S3.trimmed.2
## 4           CRISPResso_BACH2/CRISPResso_on_BACH2-A1-D_S4.trimmed.1_BACH2-A1-D_S4.trimmed.2
## 5 CRISPResso_BACH2/CRISPResso_on_BACH2-A1-HEK-A_S49.trimmed.1_BACH2-A1-HEK-A_S49.trimmed.2
## 6 CRISPResso_BACH2/CRISPResso_on_BACH2-A1-HEK-B_S50.trimmed.1_BACH2-A1-HEK-B_S50.trimmed.2
##   locus locus2 replicate sortBin
## 1 BACH2  BACH2        A1       A
## 2 BACH2  BACH2        A1       B
## 3 BACH2  BACH2        A1       C
## 4 BACH2  BACH2        A1       D
## 5 BACH2  BACH2    A1-HEK       A
## 6 BACH2  BACH2    A1-HEK       B

#remove some of the control samples
allSamples = allSamples[!grepl("HEK62", allSamples$replicate),]

Load the data.

if (FALSE){
  #this was run on my computer to load the data from many files spread out over many subdirectories. Rather than upload all these files separately, I have loaded them all locally and saved the resulting concatenated file onto github. I leave this code here so that others may view how the data was loaded, should they have their own CRISPResso files to analyze.
  inDir = "/Path/To/CRISPResso/Files";
  setwd(inDir)
  
  
  allCRISPRessoData= data.frame()
  for (i in 1:nrow(allSamples)){
    curData = read.table(file=sprintf("%s/%s/Alleles_frequency_table.txt",inDir, allSamples$directory[i]), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F, comment.char = "")
    curData2 = read.table(file=sprintf("%s/Deep_resequencing_analysis/%s/Alleles_frequency_table.txt",inDir, allSamples$directory[i]), sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F, comment.char = "")
    curData = rbind(curData, curData2);
    curData$locus = allSamples$locus[i];
    curData$replicate = allSamples$replicate[i];
    curData$sortBin = allSamples$sortBin[i];
    allCRISPRessoData = rbind(allCRISPRessoData, curData)
  }
  #remove gaps from sequences
  allCRISPRessoData$SeqSpecies = gsub("-","",allCRISPRessoData$Aligned_Sequence); # Allele
  #remove unwanted fields
  allCRISPRessoData$Aligned_Sequence=NULL; allCRISPRessoData$Reference_Sequence=NULL;  allCRISPRessoData$X.Reads.1=NULL;
  write.table(allCRISPRessoData, file=sprintf("%s/loaded_BACH2_BE_CRISPResso_data.txt", inDir),row.names = F, col.names = T, quote=F, sep="\t")
  #I then gzipped this file and uploaded it to github
}else{
  #load the CRISPResso files from GitHub. 
  z= gzcon(url(sprintf("%s/vignettes/BACH2_data/loaded_BACH2_BE_CRISPResso_data.txt.gz",maudeGitPathRoot)));
  fileConn=textConnection(readLines(z));
  allCRISPRessoData =  read.table(fileConn, sep="\t", quote="", header = T, row.names = NULL, stringsAsFactors = F)
  close(fileConn)
}

Polish CRISPResso genotype data, calculate reads per sample and genotype composition per sample.

#CRISPResso has split some sequences into two or maybe more lines; below is an example
#We also input multiple files from different sequencing runs. This merges all identical seq species'/samples
allCRISPRessoData = cast(melt(allCRISPRessoData, id.vars = c("SeqSpecies","Reference_Name","Read_Status","n_deleted","n_inserted","n_mutated", "locus","replicate","sortBin")), SeqSpecies + Reference_Name + Read_Status + n_deleted + n_inserted + n_mutated + locus + replicate + sortBin ~ variable, value="value", fun.aggregate=sum)

#Get read totals per replicate
allCRISPRessoDataTotals = cast(allCRISPRessoData, locus + replicate + sortBin ~ ., value="X.Reads", fun.aggregate = sum)
names(allCRISPRessoDataTotals)[ncol(allCRISPRessoDataTotals)] = "totalReads";

#Add totalReads column to allCRISPRessoData, calculate read fractions
allCRISPRessoData = merge(allCRISPRessoData, allCRISPRessoDataTotals, by=c("locus","replicate","sortBin"))
allCRISPRessoData$readFraction = allCRISPRessoData$X.Reads/allCRISPRessoData$totalReads;

Quality Control analysis for the experiment

Reads per sample

p = ggplot(allCRISPRessoDataTotals, aes(x=replicate, fill=sortBin, y=totalReads)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ locus) + theme_classic() + scale_y_continuous(expand=c(0,0))+theme(axis.text.x=element_text(hjust=1, angle=90)); print(p)

Same on a log scale

p = ggplot(allCRISPRessoDataTotals, aes(x=replicate, fill=sortBin, y=totalReads)) + geom_bar(stat="identity", position="dodge") + facet_grid(. ~ locus) + theme_classic() + scale_y_log10(expand=c(0,0))+theme(axis.text.x=element_text(hjust=1, angle=90)); print(p)

Toss all the cases where the locus was unmodified but sequenced anyway. Note that this means each was effectively only 50K cells, not 100k because the gDNA was divided in half

allCRISPRessoData = allCRISPRessoData[ !(allCRISPRessoData$locus=="HEK" & grepl("^[ABCR][12]$", allCRISPRessoData$replicate)) & !(allCRISPRessoData$locus=="BACH2" & grepl("^HEK9[12]", allCRISPRessoData$replicate)),]

Plot a CDF showing the cumulative fraction of genotypes (y axis) per sample (colour) by their abundance in the sequencing data (x axis). This illustrates how biased the data are towards rare genotypes (left) vs abundant genotypes (right). The more the curves shift to the right, the less complex the modifications to the DNA. A lot of the ones to the left are from sequencing artifacts.

p = ggplot(allCRISPRessoData[allCRISPRessoData$sortBin=="NS",], aes(x= readFraction, colour=replicate)) + stat_ecdf()+facet_grid(locus ~ .) + scale_x_log10() + theme_bw(); print(p)

Estimate the %unmodified in each sample.

temp = cast(allCRISPRessoData[allCRISPRessoData$Read_Status=="UNMODIFIED" & allCRISPRessoData$Reference_Name=="Reference",], formula = sortBin + locus+ replicate ~. ,value = "readFraction", fun.aggregate = max)
names(temp)[ncol(temp)]="readFraction";
p = ggplot(temp, aes(x=sortBin, y=replicate, fill=readFraction)) + geom_tile() + facet_grid(locus ~., scales="free_y")+ggtitle("just unmodified read fractions") + scale_fill_gradientn(colours=c("red","orange", "green","cyan","blue","violet"), limits=c(0,0.8)); print(p)

min(temp$readFraction, na.rm = T) # 0.0356963

## [1] 0.0356963

Sample exclusion

allCRISPRessoData = allCRISPRessoData[allCRISPRessoData$replicate!="A1-HEK",] # this sample had no data for bin D (BACH2) because it didn't PCR well

allCRISPRessoData = allCRISPRessoData[allCRISPRessoData$replicate!="B2-HEK",] # this sample had very low coverage for bin D for BACH2

Identify and retain only those alleles that are present in all samples

seqsObserved = cast(allCRISPRessoData, SeqSpecies + locus + Read_Status + Reference_Name~ .)

## Using readFraction as value column.  Use the value argument to cast to override this choice

## Aggregation requires fun.aggregate: length used as default

names(seqsObserved)[ncol(seqsObserved)]="seqRunsObserved";

retainedSamples = unique(allCRISPRessoData[,c("locus","replicate","sortBin")])

for (l in unique(allSamples$locus)){
  seqsObserved$inAll[seqsObserved$locus == l] = seqsObserved$seqRunsObserved[seqsObserved$locus == l]==sum(retainedSamples$locus==l)
}

#require that all samples have at least one read for every species considered. This will help exclude read errors
keepAlleles = seqsObserved[seqsObserved$inAll,]

MAUDE Analysis of base editor screen

#First make a count matrix, but only of alleles seen in all replicates
readCountMat = cast(allCRISPRessoData[allCRISPRessoData$SeqSpecies %in% keepAlleles$SeqSpecies,], SeqSpecies  + replicate +locus ~ sortBin, value="X.Reads")
readCountMat[is.na(readCountMat)] = 0
readCountMat = readCountMat[order(readCountMat$NS, decreasing = T),]

#Label WT alleles
wtSeqs = data.frame(locus = c("HEK","BACH2"), SeqSpecies =  c("GGTAGCCAGAGACCCGCTGGTCTTCTTTCCCCTCCCCTGCCCTCCCCTCCCTTCAAGATGGCTGACAA","TGCCCCACCCTGTGCCCTTTTTACATTACACACAAATAGGGACGGATTTCCTGTAAGCTGATCTTGAAGAAAAAAAACATGTTAGACAAAGAAAATCAGAACTAAGA"), isWT=T, stringsAsFactors = F);
readCountMat = merge(readCountMat, wtSeqs, by=c("locus","SeqSpecies"), all.x=T)
readCountMat$isWT[is.na(readCountMat$isWT)]=F;

allCRISPRessoData = merge(allCRISPRessoData, wtSeqs, by=c("locus","SeqSpecies"), all.x=T)
allCRISPRessoData$isWT[is.na(allCRISPRessoData$isWT)]=F;


binBounds = makeBinModel(data.frame(Bin=c("A","B","C","D","E","F"), fraction=rep(0.105,6))) # 10.5% bins
# but only the top ~21% (CD) and bottom ~21% (AB) were retained
binBounds = binBounds[binBounds$Bin %in% c("A","B","E","F"),] 
binBounds$Bin[binBounds$Bin=="E"]="C"
binBounds$Bin[binBounds$Bin=="F"]="D"

The bin layout:

p = ggplot(binBounds, aes(colour=Bin))  + 
  geom_density(data=data.frame(x=rnorm(100000)), aes(x=x), fill="gray", colour=NA)+ 
  ggplot2::geom_segment(aes(x=binStartZ, xend=binEndZ, y=fraction, yend=fraction)) + 
  xlab("Bin bounds as expression Z-scores") + 
  ylab("Fraction of the distribution captured") +theme_classic()+scale_y_continuous(expand=c(0,0))+
  coord_cartesian(ylim=c(0,0.7)); print(p)

Replicate bin stats for each experiment (the same sorting parameters were used for each)

curExpts = unique(readCountMat[c("replicate","locus")])

binStatsAll = data.frame();
for(i in 1:nrow(curExpts)){
  binStatsAll = rbind(binStatsAll, cbind(binBounds, curExpts[i,]))
}

#perform MAUDE analysis at guide (allele in this case) level for each locus.
guideLevelStats = data.frame();
for( l in unique(allSamples$locus)){
  message(l);
  statsA = findGuideHitsAllScreens(experiments = curExpts[curExpts$locus==l,], countDataFrame = readCountMat[readCountMat$locus==l,], binStats = binStatsAll[binStatsAll$locus==l,], sortBins = c("A","B","C","D"), unsortedBin = "NS", negativeControl = "isWT")
  guideLevelStats = rbind(guideLevelStats, statsA)
}

## BACH2

## Mu optimization returned NaN objective: restricting search space

## New interval = c(-0.320000, 0.320000)

## Mu optimization returned NaN objective: restricting search space

## New interval = c(-0.328000, 0.328000)

## HEK

## Mu optimization returned NaN objective: restricting search space

## New interval = c(-0.232000, 0.232000)

## Mu optimization returned NaN objective: restricting search space

## New interval = c(-0.232000, 0.232000)

## Mu optimization returned NaN objective: restricting search space

## New interval = c(-0.224000, 0.224000)

## Mu optimization returned NaN objective: restricting search space

## New interval = c(-0.232000, 0.232000)

## Mu optimization returned NaN objective: restricting search space

## New interval = c(-0.216000, 0.216000)

## Mu optimization returned NaN objective: restricting search space

## New interval = c(-0.232000, 0.232000)

Summarize base changes per allele

baseChanges = data.frame()
for (l in wtSeqs$locus){
  wtSeq = wtSeqs$SeqSpecies[wtSeqs$locus==l];
  wtSplit = strsplit(wtSeq,"")[[1]]
  curSeqs = unique(guideLevelStats$SeqSpecies[guideLevelStats$locus==l & guideLevelStats$libFraction > 0.001])
  for (i in 1:length(curSeqs)){
    curSplit = strsplit(curSeqs[i],"")[[1]]
    mismatches = c();
    mismatchPoss=c();
    for (j in 1:min(length(wtSplit),length(curSplit))){
      if (wtSplit[j]!=curSplit[j]){
        mismatches = c(mismatches, curSplit[j]);
        mismatchPoss = c(mismatchPoss, j)
      }
    }
    if (length(mismatchPoss)>0){
      baseChanges = rbind(baseChanges, data.frame(mismatch=mismatches, position = mismatchPoss, SeqSpecies=curSeqs[i], locus=l))
    }
  }
}

baseChangesSummary = cast(baseChanges, SeqSpecies +locus~ ., value="mismatch")

## Aggregation requires fun.aggregate: length used as default

names(baseChangesSummary)[ncol(baseChangesSummary)] = "numMismatches";
p = ggplot(baseChangesSummary, aes(x=numMismatches, colour=locus)) + stat_ecdf()+ geom_vline(xintercept = 15); print(p)

#exclude any where the number of mismatches is greater than 15 - these are also likely read or PCR artifacts.
baseChangesSummary$keepAlleles= baseChangesSummary$numMismatches<15;

baseChanges$has = 1;

baseChangesNumWith = cast(baseChanges[baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles],], position + mismatch + locus~ ., value="has", fun.aggregate = sum)
names(baseChangesNumWith)[ncol(baseChangesNumWith)] = "numSeqsWith"
baseChanges = merge(baseChanges, baseChangesNumWith, by=c("position","mismatch","locus"))
baseChanges$mismatch = as.character(baseChanges$mismatch)
baseChanges$SeqSpecies = as.character(baseChanges$SeqSpecies)


p = ggplot(baseChanges[baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles] & !(baseChanges$SeqSpecies %in% baseChanges$SeqSpecies[baseChanges$numSeqsWith==1]),], aes(x=position, y=SeqSpecies, fill=mismatch)) + geom_tile()+scale_fill_manual(values=c("orange","green","blue","red"))+scale_x_continuous(expand=c(0,0))+theme_classic() + theme(axis.text.y = element_text(size=5, family = "Courier")) + facet_grid(locus ~ ., scales="free", space ="free"); print(p)

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

#A_53 is the mutation of interest

baseChangesMatrix = cast(baseChanges[baseChanges$locus == "BACH2" & baseChanges$SeqSpecies %in% baseChangesSummary$SeqSpecies[baseChangesSummary$keepAlleles] & !(baseChanges$SeqSpecies %in% baseChanges$SeqSpecies[baseChanges$numSeqsWith==1]),], SeqSpecies ~ mismatch + position, value="has", fill = 0)
A53_seq = baseChangesMatrix$SeqSpecies[apply(baseChangesMatrix[2:ncol(baseChangesMatrix)],1, sum)==1 & baseChangesMatrix$A_53==1]

guideLevelStats$pooled = grepl("-",guideLevelStats$replicate);

wtAndVarMaudeMu = cast(guideLevelStats[guideLevelStats$SeqSpecies %in% c(A53_seq, wtSeqs$SeqSpecies[wtSeqs$locus=="BACH2"]),], replicate +pooled ~ SeqSpecies, value="mean")
names(wtAndVarMaudeMu)[names(wtAndVarMaudeMu)==A53_seq]="rs72928038";
names(wtAndVarMaudeMu)[names(wtAndVarMaudeMu)==wtSeqs$SeqSpecies[wtSeqs$locus=="BACH2"]]="WT";

ttestResults = t.test(x = wtAndVarMaudeMu$rs72928038[!wtAndVarMaudeMu$pooled], y=wtAndVarMaudeMu$WT[!wtAndVarMaudeMu$pooled], alternative="less", paired=TRUE, var.equal = FALSE)

ttestResults_mixedCells = t.test(x = wtAndVarMaudeMu$rs72928038[wtAndVarMaudeMu$pooled], y=wtAndVarMaudeMu$WT[wtAndVarMaudeMu$pooled], alternative="less", paired=TRUE, var.equal = FALSE)

wtAndVarMaudeMu$replicate2 = gsub("-HEK","",wtAndVarMaudeMu$replicate)

Make plot of WT vs rs72928038 mean expression levels

meltedSNPMus = melt(as.data.frame(wtAndVarMaudeMu), id.vars=c("pooled","replicate", "replicate2"));
meltedSNPMus$genotype = factor(ifelse(meltedSNPMus$variable=="WT", "G", "A (risk)"),levels=c("G","A (risk)"))
p = ggplot(meltedSNPMus, aes(x=genotype, y=value, group=replicate)) + geom_point() + geom_line()+facet_grid(. ~ pooled)+theme_bw()+xlab("Genotype") + ylab("Mean expression") + ggtitle(sprintf("P=%f; P=%f", ttestResults$p.value, ttestResults_mixedCells$p.value)); print(p)

BACH2 Base editor flow-FISH screen

Carl de Boer

24/02/2022

Introduction

Loading the data/libraries

Quality Control analysis for the experiment

MAUDE Analysis of base editor screen