set up notebook

In [1]:
#!rm genomes/tb927_6/*
In [2]:
#reload when modified
%load_ext autoreload
%autoreload 2
#activate r magic
%load_ext rpy2.ipython
%matplotlib inline
In [13]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utilities as UT
import missingno as msno
import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import gc

random.seed(1976)
np.random.seed(1976)

Data Anaylsis

Experiment SetUp

In [14]:
from IPython.display import Image
In [ ]:
 
In [15]:
#create a dictionary of gene to desc
#from the gff file
def make_desc(_GFF):
    gff =pd.read_csv( _GFF, sep='\t', header=None, comment='#')
    
    gff = gff[gff.iloc[:,2]=='gene']
    #print( gff[gff[gff.columns[-1]].str.contains('Tb427_020006200')] )
    desc = {}
    for n in gff.iloc[:,-1]:
        n=n.replace('%2C',' ')
        item_list = n.split(';')
        #print (item_list)
        temp_dict = {}
        for m in item_list:
            #print(m)
            temp_dict[m.split('=')[0].strip()]=m.split('=')[1].strip()
        #print(temp_dict['ID'])
        #print(temp_dict['description'])
        desc[temp_dict['ID']]=temp_dict.get('description','none')
    return desc

desc_dict = make_desc('genomes/tb927_3/tb927_3.gff')
desc_dict['Tb10.v4.0073']
Out[15]:
'variant surface glycoprotein (VSG  pseudogene)  putative'
In [16]:
exp = 'P{life_stage}{replica}'
list_df = [exp.format(
    life_stage=life_stage,
    replica=replica) 
 for life_stage in ['1','2','3']
 for replica in ['C','T']
            ]
list_df = [n+'/res/'+n+'/counts.txt' for n in list_df]
list_df =[pd.read_csv(n,index_col=[0],comment='#',sep='\t') for n in list_df]
df = list_df[0].copy()
for temp_df in list_df[1:]:
    df = df.join(temp_df.iloc[:,-1])
df.head()
#temp_df = pd.read_csv('BSF/tb927_3_ks_counts_final.txt',index_col=[0],comment='#',sep='\t')
Out[16]:
Chr Start End Strand Length /tmp/3092.1.all.q/P1C/P1Csorted.bam /tmp/3093.1.all.q/P1T/P1Tsorted.bam /tmp/3094.1.all.q/P2C/P2Csorted.bam /tmp/3095.1.all.q/P2T/P2Tsorted.bam /tmp/3096.1.all.q/P3C/P3Csorted.bam /tmp/3097.1.all.q/P3T/P3Tsorted.bam
Geneid
Tb10.v4.0073 tryp_X-188b09.p2kB601 929 1489 + 561 1 0 0 2 0 0
Tb10.v4.0074 tryp_X-188b09.p2kB601 2775 3452 + 678 5 5 3 6 7 5
Tb10.v4.0075 tryp_X-188b09.p2kB601 3781 5223 + 1443 0 1 4 0 0 0
Tb10.v4.0076 tryp_X-188b09.p2kB601 6264 7721 + 1458 1 0 1 0 0 2
Tb10.v4.0077 tryp_X-188b09.p2kB601 9669 10955 + 1287 0 0 0 0 0 0
In [17]:
#df = pd.read_csv('InData/tb927_3_ks_counts_final.txt',index_col=[0],comment='#',sep='\t')
#df.head()
#data_col = df.columns[6:25]
In [18]:
data_col = df.columns[5:]
data_col
Out[18]:
Index(['/tmp/3092.1.all.q/P1C/P1Csorted.bam',
       '/tmp/3093.1.all.q/P1T/P1Tsorted.bam',
       '/tmp/3094.1.all.q/P2C/P2Csorted.bam',
       '/tmp/3095.1.all.q/P2T/P2Tsorted.bam',
       '/tmp/3096.1.all.q/P3C/P3Csorted.bam',
       '/tmp/3097.1.all.q/P3T/P3Tsorted.bam'],
      dtype='object')
In [19]:
indata = df[data_col]
indata.columns = [n.split('/')[3] for  n in indata.columns]
indata = indata[['P1C','P2C','P3C','P1T','P2T','P3T']]
indata.head()
Out[19]:
P1C P2C P3C P1T P2T P3T
Geneid
Tb10.v4.0073 1 0 0 0 2 0
Tb10.v4.0074 5 3 7 5 6 5
Tb10.v4.0075 0 4 0 1 0 0
Tb10.v4.0076 1 1 0 0 0 2
Tb10.v4.0077 0 0 0 0 0 0
In [20]:
print(indata.shape)
indata=indata.dropna()
print(indata.shape)
#indata.loc['KS17gene_1749a']
#indata['desc']=[desc_dict.get(n,'none') for n in indata.index.values]
#indata.to_csv('indata.csv')
#indata.head()
#indata.loc['mainVSG-427-2']
(19949, 6)
(19949, 6)

Missing Data Viz

In [ ]:
 
In [22]:
msno.matrix(indata.replace(0,np.nan).dropna(how='all'),figsize=(4, 4))
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x2acb538faba8>
In [23]:
msno.bar(indata.replace(0,np.nan).dropna(how='all'),figsize=(4, 4))
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x2acb53b70048>

QC - Corr analysis

In [24]:
!mkdir -p Figures
In [27]:
fig,ax=plt.subplots(figsize=(6,6))
cbar_ax = fig.add_axes([.91, .6, .03, .2])
sns.heatmap(np.log2(indata).corr(),
            #vmin=-1,
            cmap='coolwarm',
            annot=True,linewidths=.5,ax=ax, cbar_ax = cbar_ax, cbar=True)
print(ax.get_ylim())
ax.set_ylim(6,0)
plt.savefig('Figures/Figure_2.png')
plt.show()
(5.5, 0.5)
In [ ]:
 

QC - MSD

In [28]:
plt.style.use('ggplot')
palette = ['r']*3+['b']*3
fig,ax = plt.subplots(figsize=(4,4), ncols=1, nrows=1)
UT.make_mds(np.log2(indata),palette,ax,color_dictionary={'r':'C','b':'T',
                                                        })
plt.savefig('Figures/Figure_3.png')
plt.show()
{'r': 'C', 'b': 'T'}

Compute Length and GC content

In [29]:
!mkdir -p InData
In [30]:
!gtf2bed < genomes/tb927_6/tb927_6.gtf > tb927_6.bed
!bedtools nuc -fi genomes/tb927_6/tb927_6.fa -bed tb927_6.bed >InData/GC_content_927.txt
In [31]:
#!bowtie2-build genomes/tb927_5/tb927_5.fa genomes/tb927_5/tb927_5
In [32]:
def get_gene_ids(n):
    res = {}
    temp = n.split(';')
    temp =[n.strip() for n in temp if len(n)>2]
    for f in temp:
        key = f.split(' ')[0]
        value = f.split(' ')[1]
        key=key.replace('\"','').replace('\'','').strip()
        value=value.replace('\"','').replace('\'','').strip()
        res[key]=value
    return res['gene_id']
In [33]:
gc_content = pd.read_csv('InData/GC_content_927.txt',sep='\t')
gc_content = gc_content[gc_content['8_usercol']=='transcript']
gc_content['gene_id'] = [get_gene_ids(n) for n in gc_content['10_usercol']]
gc_content = gc_content.drop_duplicates('gene_id')
gc_content.set_index('gene_id',inplace=True)
gc_content=gc_content[['19_seq_len','12_pct_gc']]
gc_content.columns = ['length', 'gccontent']
gc_content.head()
Out[33]:
length gccontent
gene_id
MSTRG.1 2768 0.505419
MSTRG.2 2482 0.304593
TRY.1 746 0.273458
TRY.2 1531 0.288047
TRY.3 1390 0.379856
In [34]:
indata.head()

#indata.astype(int).to_csv('indata.csv',index=True)
Out[34]:
P1C P2C P3C P1T P2T P3T
Geneid
Tb10.v4.0073 1 0 0 0 2 0
Tb10.v4.0074 5 3 7 5 6 5
Tb10.v4.0075 0 4 0 1 0 0
Tb10.v4.0076 1 1 0 0 0 2
Tb10.v4.0077 0 0 0 0 0 0
In [35]:
#metadata=pd.DataFrame()
#metadata['samples']=indata.columns
#metadata['treatment']=['C','C','C','D','D','D','DF','DF','DF','F','F','F']
#metadata['batch']=1
#metadata.reset_index().to_csv('metadata.csv')
In [36]:
print(indata.shape)
indata=indata.join(gc_content,how='inner')
gc_content = gc_content[['length', 'gccontent']]
indata.drop(['length', 'gccontent'],axis=1,inplace=True) 
(19949, 6)
In [ ]:
 
In [ ]:
 
In [37]:
#indata.loc['mainVSG-427-2']

edgeR to filter low counts

In [38]:
%%R -i indata
options(warn=-1)
library("limma") 
library("edgeR")
head(indata)
             P1C P2C P3C P1T P2T P3T
Tb10.v4.0073   1   0   0   0   2   0
Tb10.v4.0074   5   3   7   5   6   5
Tb10.v4.0075   0   4   0   1   0   0
Tb10.v4.0076   1   1   0   0   0   2
Tb10.v4.0077   0   0   0   0   0   0
Tb10.v4.0078 173  96 117 198 165 197
In [40]:
%%R
group <- factor(c(
    'C','C','C',
    'T','T','T'
))

y <- DGEList(counts=indata,group=group)
keep <- filterByExpr(y)
y <- y[keep,,keep.lib.sizes=FALSE]
counts = y$counts
genes = row.names(y)
In [41]:
%R -o counts,genes
indata = pd.DataFrame(counts,index=genes,columns=indata.columns)
indata.shape
Out[41]:
(14550, 6)
In [ ]:
 
In [42]:
indata=indata.join(gc_content,how='inner')
indata.shape
Out[42]:
(14550, 8)

GC / length content

In [43]:
gc_content = indata[['length', 'gccontent']]
indata.drop(['length', 'gccontent'],axis=1,inplace=True)
print(indata.shape,gc_content.shape)
indata.head()
(14550, 6) (14550, 2)
Out[43]:
P1C P2C P3C P1T P2T P3T
Tb10.v4.0078 173 96 117 198 165 197
Tb927.8.130 23 11 6 20 17 16
Tb927.8.150 292 177 191 200 251 203
Tb927.8.270 2682 1766 1951 2232 2079 1761
Tb927.8.320 68 25 29 50 42 51

size factors

In [44]:
sizeFactors=indata.sum()
sizeFactors = sizeFactors.values
sizeFactors
Out[44]:
array([95698845, 62105125, 66819012, 64952547, 66553752, 63170283])
In [45]:
#np.log2(gc_content['length']/1000).plot(kind='hist')

Bias Correction

In [47]:
%%R -i gc_content,indata,sizeFactors
library(cqn)
library(scales)
In [48]:
%%R
stopifnot(all(rownames(indata) == rownames(gc_content)))
cqn.subset <- cqn(indata, lengths = gc_content$length,
                  x = gc_content$gccontent, sizeFactors = sizeFactors,
                  verbose = TRUE)
RQ fit ......
SQN .
In [49]:
#%R cqn.subset

Viz Bias

In [50]:
%%R
cqnplot <- function(x, n = 1, col = "grey60", ylab="estimated bias effect", 
                    xlab = "", type = "l", lty = 1, ...) {
    if(class(x) != "cqn")
        stop("'x' needs to be of class 'cqn'")
    if(n == 1) {
        func <- x$func1
        grid <- x$grid1
        knots <- x$knots1
    }
    if(n == 2) {
        if(is.null(x$func2))
            stop("argument 'x' does not appear to have two smooth functions (component 'func2' is NULL)")
        func <- x$func2
        grid <- x$grid2
        knots <- x$knots2
    }
    
    
    #par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
    matplot(replicate(ncol(func), grid), func, ylab = ylab, xlab = xlab, type = type,
            col = col, lty = lty, ...)
    
    legend("bottomleft", legend = colnames(x$counts), inset=c(1,0),
           title="Samples", lty = lty, col = col)
    rug(knots, lwd = 2)
    invisible(x)
    
}


library(repr)
#options(repr.plot.width = 10, repr.plot.height = 0.75)
# Change plot size to 4 x 3
#options(repr.plot.width=4, repr.plot.height=3)


colors <- c(
    'red','red','red','red',
    'blue','blue','blue',
    'green','green','green',
    'yellow','yellow','yellow'
           )
lty =c(1,1,1,
       1,1,1,
      1,1,1,
      1,1,1)

#png("Figures/Figure_12.png")
#par(mfrow=c(1,2))
cqnplot(cqn.subset, col=colors,
        n = 1, xlab = "GC content", lty = lty,
        ylim = c(3, 5.5), 
        
       )
#dev.off()

#ggsave('plot.png', width=8.27, height= 11.69) #A4 size in inches
#dev.off()
In [51]:
%%R
library(repr)
#options(repr.plot.width = 12, repr.plot.height = 0.75)
# Change plot size to 4 x 3
#options(repr.plot.width=8, repr.plot.height=3)


colors <- c(
    'red','red','red','red',
    'blue','blue','blue',
    'green','green','green',
    'yellow','yellow','yellow'
           )
lty =c(1,1,1,
       1,1,1,
      1,1,1,
      1,1,1)

#par(mfrow=c(1,2))
#png("Figures/Figure_13.png")
cqnplot(cqn.subset, col=colors,
        n = 2, xlab = "length", lty = lty,
        ylim = c(0.2,8), 
    
       )
#dev.off()

Bias Correction

In [52]:
%%R
RPKM.cqn <- cqn.subset$y + cqn.subset$offset
out_table <- RPKM.cqn
head(out_table)
                   P1C       P2C       P3C       P1T       P2T        P3T
Tb10.v4.0078  3.255765  3.133420  3.205389  3.793779  3.538797  3.9559340
Tb927.8.130  -2.389388 -2.620853 -3.637160 -1.951168 -2.260972 -2.4996684
Tb927.8.150   1.213153  1.122199  1.163335  1.111491  1.541013  1.2409137
Tb927.8.270   4.171547  4.240597  4.236281  4.442233  4.332082  4.1916834
Tb927.8.320  -1.459074 -2.077858 -2.105363 -1.077946 -1.521346 -1.2633234
Tb927.8.350  -1.077946 -1.677259 -1.523089 -1.077946 -1.582936 -0.9310127
In [53]:
#out_table
In [54]:
%R -o out_table
out_table = pd.DataFrame(out_table,index=indata.index.values,columns=indata.columns)
out_table.head()
Out[54]:
P1C P2C P3C P1T P2T P3T
Tb10.v4.0078 3.255765 3.133420 3.205389 3.793779 3.538797 3.955934
Tb927.8.130 -2.389388 -2.620853 -3.637160 -1.951168 -2.260972 -2.499668
Tb927.8.150 1.213153 1.122199 1.163335 1.111491 1.541013 1.240914
Tb927.8.270 4.171547 4.240597 4.236281 4.442233 4.332082 4.191683
Tb927.8.320 -1.459074 -2.077858 -2.105363 -1.077946 -1.521346 -1.263323

Visualise Normalized Counts

In [55]:
fig,axes=plt.subplots(figsize=(16,4),ncols=2)
ax = axes[0]
out_table.plot(kind='box',ax=ax,rot=90,showfliers=False)

ax = axes[1]
out_table.replace(-np.inf,-1.5).plot(kind='hist',
                                            histtype='step',
                                            bins=100,ax=ax)
UT.hist_legend(ax,'Sample')
#ax.set_xticklabels(out_df.columns, rotation=90, )
plt.show()
print(out_table.shape)
(14550, 6)

Differential Expression Analysis

In [72]:
%%R
library(edgeR)
# Make groups
design_with_all <- model.matrix( ~0+group )
y <- DGEList(counts=indata, group = group)
y <- calcNormFactors(y)

y <- estimateDisp(y, design_with_all)
# Estimate dispersion
# Fit counts to model
fit_all <- glmQLFit( y, design_with_all )
In [73]:
%%R
contrastCT <- glmQLFTest( fit_all, contrast=makeContrasts( groupT-groupC, levels=design_with_all ) )
tableCT <- topTags(contrastCT, n=Inf, sort.by = "none", adjust.method="BH")$table
head(tableCT)
                  logFC     logCPM          F       PValue         FDR
Tb10.v4.0078 0.68208241  1.1979400 31.4642747 0.0003076558 0.005775989
Tb927.8.130  0.54529014 -1.9972317  2.7536979 0.1307481835 0.297656793
Tb927.8.150  0.12790506  1.6615998  1.7256210 0.2208592344 0.415260436
Tb927.8.270  0.05603467  4.8992681  0.7976839 0.3946026202 0.598037985
Tb927.8.320  0.38197957 -0.5992466  2.7341776 0.1319568375 0.299667861
Tb927.8.350  0.14397632 -0.8913057  0.5185434 0.4893960207 0.679716695
In [110]:
%%R 
t=topTags(contrastCT,n=20)
t$table
                    logFC   logCPM         F       PValue          FDR
Tb927.11.12100  1.8493620 7.554069 1318.2704 3.138994e-11 2.002750e-07
Tb427.BES129.4 -1.7663665 8.272147 1272.7517 3.684438e-11 2.002750e-07
Tb08.27P2.110   1.8447279 5.043706 1201.0892 4.798332e-11 2.002750e-07
Tb927.8.480     1.8664541 5.050719 1165.3766 5.505841e-11 2.002750e-07
KS17gene_4150a  1.9007535 6.767928  911.5365 1.684509e-10 4.901920e-07
Tb927.10.8460   0.9110964 7.288367  813.8274 2.820107e-10 5.425542e-07
Tb11.v5.0308    0.9221461 7.194702  804.8445 2.965918e-10 5.425542e-07
Tb11.v5.0309    0.9257243 7.210121  803.8206 2.983116e-10 5.425542e-07
Tb927.10.8470   0.8947758 7.199632  729.1806 4.642936e-10 7.506079e-07
Tb11.v5.0310    0.8945687 7.062380  698.4324 5.644954e-10 7.552422e-07
Tb427.BES129.2 -2.1027586 7.266928  688.9133 6.007363e-10 7.552422e-07
Tb11.v5.0330    0.9050035 7.628731  683.4347 6.228801e-10 7.552422e-07
Tb927.10.8480   0.8690081 7.423615  664.2419 7.087241e-10 7.847262e-07
Tb427.BES129.3 -2.1388530 7.183685  655.0227 7.550630e-10 7.847262e-07
Tb427VSG-6609  -1.8022921 7.239718  565.0476 1.473895e-09 1.429678e-06
Tb927.10.8440   0.9106215 6.979728  530.0612 1.967590e-09 1.789277e-06
Tb08.27P2.90    1.1418704 5.055913  473.2805 3.281235e-09 2.808351e-06
Tb927.10.10250  1.2245893 4.451152  408.0116 6.401112e-09 5.174232e-06
TRY.1045        1.8405730 1.890935  380.2339 8.787375e-09 6.431446e-06
Tb927.6.520     1.0783381 4.766301  376.9491 9.136446e-09 6.431446e-06
In [111]:
desc_dict['Tb08.27P2.90']
Out[111]:
'hypothetical protein'
In [74]:
%R -o tableCT

def mod_table(intable):
    intable['mlog10FDR']=-np.log10(intable['FDR'])
    intable['mlog10pvalue']=-np.log10(intable['PValue'])
    intable['desc']=[desc_dict.get(n,n) for n in intable.index.values]
    return intable

tableCT = mod_table(tableCT)
In [114]:
tableCT.sort_values('logFC').tail(10)
Out[114]:
logFC logCPM F PValue FDR mlog10FDR mlog10pvalue desc
Tb927.10.10250 1.224589 4.451152 408.011637 6.401112e-09 5.174232e-06 5.286154 8.193745 EP2 procyclin
MSTRG.272 1.282767 -1.940671 20.704398 1.317804e-03 1.480622e-02 1.829556 2.880149 MSTRG.272
Tb927.8.510 1.369908 0.964316 139.252149 7.467135e-07 1.805784e-04 3.743334 6.126846 hypothetical protein
MSTRG.271 1.526749 -0.861638 57.870300 2.961911e-05 1.460875e-03 2.835387 4.528428 MSTRG.271
Tb08.27P2.70 1.616963 1.638816 308.608014 2.238237e-08 1.144769e-05 4.941282 7.650094 hypothetical protein
TRY.1045 1.840573 1.890935 380.233853 8.787375e-09 6.431446e-06 5.191691 8.056141 TRY.1045
Tb08.27P2.110 1.844728 5.043706 1201.089160 4.798332e-11 2.002750e-07 6.698373 10.318910 PAP2 superfamily putative
Tb927.11.12100 1.849362 7.554069 1318.270355 3.138994e-11 2.002750e-07 6.698373 10.503210 RNA-binding protein putative
Tb927.8.480 1.866454 5.050719 1165.376576 5.505841e-11 2.002750e-07 6.698373 10.259176 phosphatidic acid phosphatase protein putative
KS17gene_4150a 1.900753 6.767928 911.536488 1.684509e-10 4.901920e-07 6.309634 9.773527 KS17gene_4150a
In [ ]:
 
In [75]:
for table,name in zip([tableCT],
                 ['TvsC',]):

    fig,axes=plt.subplots(figsize=(14,4), ncols=2, nrows=1)
    ax=axes[0]
    table.plot(x='logFC',y='mlog10FDR',
           kind='scatter',s=5,alpha=0.2,ax=ax,c='black')
    ax.set_xlim(-2.5,2.5)
    ax.set_title('Volcano')
    ax=axes[1]
    table.plot(x='logCPM',y='logFC',
           kind='scatter',s=5,alpha=0.2,ax=ax,c='black')
    ax.set_ylim(-2.5,2.5)
    ax.set_title('MA')
    plt.suptitle(name)
    plt.show()
In [128]:
table.loc['Tb927.11.12100']
Out[128]:
logFC                                 1.84936
logCPM                                7.55407
F                                     1318.27
PValue                            3.13899e-11
FDR                               2.00275e-07
mlog10FDR                             6.69837
mlog10pvalue                          10.5032
desc            RNA-binding protein  putative
Name: Tb927.11.12100, dtype: object
In [172]:
def annotated_volcano(table,name,selection=False):
    
    fig,axes=plt.subplots(figsize=(8,8), ncols=1, nrows=1)
    ax=axes
    if not selection:
        selection = table[((table['logFC']>1)|(table['logFC']<-1))
                          &(table['mlog10FDR']>2)&(table['logCPM']>0.8)]
    else:
        selection = table.loc[selection]
    annot_index = selection.index.values
    annot_names = selection['desc'].replace({'phosphatidic acid phosphatase protein  putative':
                                             'PAPP',
                                            'RNA-binding protein  putative':'RBP5',
                                            'PAP2 superfamily  putative':'PAP2',
                                            'THT1 - hexose transporter  putative':'THT1'})
    
    
    UT.make_vulcano(
        table,
        ax,
        x='logFC',
        y='mlog10FDR',
        fc_col = 'logFC',
        pval_col = 'FDR',
        pval_limit=0.01,
        fc_limit=1,
        annot_index=annot_index ,
        annot_names=annot_names,
        expand_text=(1.5, 1.5),
    expand_points=(2, 2),)
    ax.legend(loc='upper center', bbox_to_anchor=(0.8, 1.2), title='Legend')
    ax.set_xlim(-3,3)
    plt.title(name)
In [173]:
for table,name in zip([tableCT],
                 ['TvsC',]):
    annotated_volcano(table,name)
In [79]:
%%R
#https://rstudio-pubs-static.s3.amazonaws.com/79395_b07ae39ce8124a5c873bd46d6075c137.html
library(edgeR)
# Make groups
design_with_all <- model.matrix( ~0+group )

y <- DGEList(counts=indata, 
                  lib.size = sizeFactors,
                  group = group, 
                  )

y$offset <- cqn.subset$glm.offset
# Estimate dispersion
y <- estimateGLMCommonDisp( y, design_with_all )
y <- estimateGLMTrendedDisp( y, design_with_all )
y <- estimateGLMTagwiseDisp( y, design_with_all )
# Fit counts to model
fit_all <- glmQLFit( y, design_with_all )


#contrastDC <- glmLRT( fit_all, contrast=makeContrasts( groupD-groupC, levels=design_with_all ) )
#tableDC <- topTags(contrastDC, n=Inf, sort.by = "none", adjust.method="BH")$table
#head(tableDC)


contrastCT2 <- glmQLFTest( fit_all, contrast=makeContrasts( groupT-groupC, levels=design_with_all ) )
tableCT2 <- topTags(contrastCT2, n=Inf, sort.by = "none", adjust.method="BH")$table
head(tableCT2)
                 logFC     logCPM          F     PValue        FDR
Tb10.v4.0078 0.5679761  1.1985867 22.7284438 0.00119469 0.01454622
Tb927.8.130  0.5609122 -1.9978962  2.0997299 0.18319432 0.36729741
Tb927.8.150  0.1372703  1.6617231  1.4666977 0.25846314 0.45544855
Tb927.8.270  0.1107756  4.9005428  3.2623511 0.10632166 0.25969115
Tb927.8.320  0.5371784 -0.5995012  5.1069794 0.05185344 0.16538087
Tb927.8.350  0.2063682 -0.8920635  0.6428889 0.44450373 0.63890164
In [80]:
%R -o tableCT2
tableCT2 = mod_table(tableCT2)
In [174]:
annotated_volcano(tableCT2,'bias-corr')
In [150]:
for table,name in zip([tableCT2],
                 ['TvsC',]):

    fig,axes=plt.subplots(figsize=(14,4), ncols=2, nrows=1)
    ax=axes[0]
    table.plot(x='logFC',y='mlog10FDR',
           kind='scatter',s=5,alpha=0.2,ax=ax,c='black')
    ax.set_xlim(-2.5,2.5)
    ax.set_title('Volcano')
    ax=axes[1]
    table.plot(x='logCPM',y='logFC',
           kind='scatter',s=5,alpha=0.2,ax=ax,c='black')
    ax.set_ylim(-2.5,2.5)
    ax.set_title('MA')
    plt.suptitle(name)
    plt.show()
In [168]:
tableCT2[tableCT2['mlog10FDR']>4].sort_values('logFC').tail(20)
Out[168]:
logFC logCPM F PValue FDR mlog10FDR mlog10pvalue desc
Tb927.10.8450 0.888111 6.693319 416.996706 1.605209e-08 1.112180e-05 4.953825 7.794468 glucose transporter 1E
Tb11.v5.0310 0.951320 7.072681 660.971763 2.331964e-09 2.262005e-06 5.645506 8.632278 THT1 - hexose transporter putative
Tb927.10.8480 0.955106 7.433859 621.305533 3.024078e-09 2.750021e-06 5.560664 8.519407 glucose transporter putative
Tb927.10.8470 0.969711 7.209957 884.562380 6.843820e-10 9.957759e-07 6.001838 9.164701 glucose transporter putative
Tb927.10.8440 0.974905 6.989841 890.674490 6.648115e-10 9.957759e-07 6.001838 9.177302 glucose transporter 1B
Tb927.10.8460 0.990990 7.298830 1127.325102 2.461129e-10 8.952357e-07 6.048063 9.608866 glucose transporter putative
Tb11.v5.0308 0.992419 7.205389 831.771296 8.868775e-10 1.075339e-06 5.968455 9.052136 THT1 - hexose transporter putative
Tb11.v5.0309 0.996955 7.220800 962.803670 4.788056e-10 9.957759e-07 6.001838 9.319841 THT1 - hexose transporter putative
Tb11.v5.0330 1.023347 7.639434 584.672891 3.902499e-09 3.154520e-06 5.501067 8.408657 THT1 - hexose transporter putative
Tb927.6.520 1.077914 4.776290 332.583319 4.116702e-08 2.283486e-05 4.641402 7.385451 EP3-2 procyclin
Tb927.10.10260 1.116461 5.866959 213.409789 2.572384e-07 7.134875e-05 4.146614 6.589664 EP1 procyclin
Tb08.27P2.90 1.156797 5.066471 723.135915 1.598257e-09 1.661046e-06 5.779618 8.796353 hypothetical protein
Tb927.10.10250 1.208584 4.461750 388.728476 2.151132e-08 1.422680e-05 4.846893 7.667333 EP2 procyclin
Tb927.10.2010 1.232789 10.691983 1348.719586 1.154106e-10 8.396121e-07 6.075921 9.937754 hexokinase
Tb08.27P2.70 1.427606 1.642473 302.526478 6.098771e-08 2.609915e-05 4.583374 7.214758 hypothetical protein
KS17gene_4150a 1.806713 6.787228 1161.526336 2.169405e-10 8.952357e-07 6.048063 9.663659 KS17gene_4150a
Tb08.27P2.110 1.817478 5.058759 953.561975 4.986768e-10 9.957759e-07 6.001838 9.302181 PAP2 superfamily putative
Tb927.11.12100 1.827262 7.573026 1495.196260 7.464141e-11 8.396121e-07 6.075921 10.127020 RNA-binding protein putative
Tb927.8.480 1.838001 5.066011 932.595174 5.476743e-10 9.957759e-07 6.001838 9.261478 phosphatidic acid phosphatase protein putative
TRY.1045 1.848452 1.895358 346.781115 3.460189e-08 2.097740e-05 4.678248 7.460900 TRY.1045
In [146]:
tableCT2[tableCT2['mlog10FDR']>4].sort_values('logFC').head(5)
Out[146]:
logFC logCPM F PValue FDR mlog10FDR mlog10pvalue desc
Tb427.BES129.3 -2.037388 7.163869 602.887424 3.431192e-09 0.000003 5.532141 8.464555 Tb427.BES129.3
Tb427.BES129.2 -1.981223 7.247237 567.734127 4.414586e-09 0.000003 5.471001 8.355110 Tb427.BES129.2
Tb427VSG-6609 -1.527234 7.222347 307.388898 5.708681e-08 0.000025 4.599115 7.243464 Tb427VSG-6609
Tb427.BES129.4 -1.499699 8.254239 833.109465 8.808944e-10 0.000001 5.968455 9.055076 Tb427.BES129.4
TRY.133 -0.578674 6.100852 212.875800 2.598958e-07 0.000071 4.146614 6.585201 TRY.133
In [177]:
Image('Figures/Tb427.BES129.3_paperFig.png')
Out[177]:
In [178]:
Image('Figures/Tb927.11.12100_paperFig.png')
Out[178]:
In [179]:
Image('Figures/Tb927.8.480_paperFig.png')
Out[179]:
In [180]:
Image('Figures/Tb08.27P2.110_paperFig.png')
Out[180]:
In [181]:
Image('Figures/Tb927.10.2010_paperFig.png')
Out[181]:
In [183]:
!jupyter nbconvert --to html_toc FiguresPaper.ipynb
[NbConvertApp] Converting notebook FiguresShare.ipynb to html_toc
[NbConvertApp] Support files will be in FiguresShare_files/
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Making directory FiguresShare_files
[NbConvertApp] Writing 360557 bytes to FiguresShare.html
In [ ]: