import seaborn as sns
import gpplot
import matplotlib.pyplot as plt
To demonstrate the functionality of this module we'll use read counts from Sanson et al. 2018.
supp_reads = 'https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-018-07901-8/MediaObjects/41467_2018_7901_MOESM4_ESM.xlsx'
read_counts = pd.read_excel(supp_reads,
sheet_name = 'A375_orig_tracr raw reads',
header = None,
skiprows = 3,
names = ['sgRNA Sequence', 'pDNA', 'A375_RepA', 'A375_RepB'],
engine='openpyxl')
read_counts
Note that the read counts have a long right tail
sns.kdeplot(read_counts['pDNA'])
To mitigate this tail and account for sequencing biases we log-normalize read counts by taking $$\log_2 \bigg ( \frac{\text{reads}}{\text{total reads in condition}} * 10^6 + 1 \bigg )$$
lognorms = lognorm_columns(reads_df=read_counts, columns=['pDNA', 'A375_RepA', 'A375_RepB'])
sns.kdeplot(lognorms['pDNA'])
Note that there's a small number of sgRNAs with low pDNA abundance. We'll filter out any sgRNAs with lognormed pDNA less than the mean minus 3 standard deviations.
filtered_lognorms = filter_pdna(lognorm_df=lognorms, pdna_cols=['pDNA'])
lognorms.shape[0] - filtered_lognorms.shape[0]
sns.kdeplot(filtered_lognorms['pDNA'])
assert ((filtered_lognorms['pDNA'] - lognorms['pDNA'].mean())/lognorms['pDNA'].std()).min() > -3
We'll calculate log-fold changes from the pDNA reference. Since we only have one reference condition we can supply the calculate_lfcs function with the ref_col and target_cols arguments.
lfc_df = calculate_lfcs(lognorm_df=filtered_lognorms, ref_col='pDNA', target_cols=['A375_RepA', 'A375_RepB'])
Alternatively we can supply a ref_map, which is useful when we have more than one reference condition
lfc_df2 = calculate_lfcs(filtered_lognorms, ref_map = {'A375_RepA': 'pDNA', 'A375_RepB': 'pDNA'})
assert lfc_df.equals(lfc_df2)
We can use seaborn's clustermap function to plot a clustermap of replicate correlations
sns.clustermap(lfc_df.corr(), vmin= 0, cmap= 'viridis', annot= True, figsize= (4,4))
Since we only have two conditions it's also easy to visualize replicates as a point densityplot using gpplot
plt.subplots(figsize=(4,4))
gpplot.point_densityplot(data=lfc_df, x='A375_RepA', y='A375_RepB')
gpplot.add_correlation(data=lfc_df, x='A375_RepA', y='A375_RepB')
sns.despine()
Since we see a strong correlation, we'll average the log-fold change of each sgRNA across replicates
avg_replicate_lfc_df = average_replicate_lfcs(lfcs=lfc_df, guide_col='sgRNA Sequence', condition_indices=[0],
sep='_')
assert np.abs(avg_replicate_lfc_df.shape[0] - read_counts.shape[0]) < 1000
We can alternatively combine replicates by supplying a condition_map
avg_replicate_lfc_df_condition_map = average_replicate_lfcs(lfcs=lfc_df, guide_col='sgRNA Sequence',
condition_map={'A375_RepA': 'A375',
'A375_RepB': 'A375'})
pd.testing.assert_frame_equal(avg_replicate_lfc_df_condition_map, avg_replicate_lfc_df)
If we don't want to join replicates, we don't have to supply these arguments
distinct_replicate_lfc_df = average_replicate_lfcs(lfcs=lfc_df, guide_col='sgRNA Sequence')
assert distinct_replicate_lfc_df.shape[0] == avg_replicate_lfc_df.shape[0]*2
Before combining sgRNAs at the gene level, it's sometimes helpful to group controls into pseudo-genes so they're easier to compare with target genes
guide_annotations = pd.read_excel(supp_reads,
sheet_name='sgRNA annotations',
engine='openpyxl')
guide_annotations
remapped_annotations = group_pseudogenes(annotations=guide_annotations, pseudogene_size=4,
gene_col='Annotated Gene Symbol',
control_regex=['NO_CURRENT'])
control_genes = remapped_annotations.loc[remapped_annotations['Annotated Gene Symbol'].str.contains('_'), 'Annotated Gene Symbol']
assert control_genes.value_counts().max() == 4
assert control_genes.value_counts().min() == 4
assert len(control_genes) == 1000
remapped_annotations
We provide two methods for scaling log-fold change values to controls:
- Z-score from a set of negative controls
- Scale scores between a set of negative and positive controls
For both scoring methods, you can input either a regex or a list of genes to define control sets
nonessential_genes = (pd.read_table('https://raw.githubusercontent.com/gpp-rnd/genesets/master/human/non-essential-genes-Hart2014.txt',
names=['gene'])
.gene)
essential_genes = (pd.read_table('https://raw.githubusercontent.com/gpp-rnd/genesets/master/human/essential-genes-Hart2015.txt',
names=['gene'])
.gene)
annotated_lfcs = get_annotated_lfcs(avg_replicate_lfc_df, remapped_annotations, merge_on='sgRNA Sequence')
assert annotated_lfcs.shape[0] == avg_replicate_lfc_df.shape[0]
no_current_controls = get_control_lfcs(annotated_lfcs, 'NO_CURRENT', 'Annotated Gene Symbol')
assert abs(no_current_controls.shape[0] - 1000) < 20
nonessential_controls = get_control_lfcs(annotated_lfcs, nonessential_genes, 'Annotated Gene Symbol')
assert abs(nonessential_controls.shape[0] - len(nonessential_genes) * 4) < 200
z_scored_lfcs = get_neg_ctl_z_score(annotated_lfcs, nonessential_genes, 'Annotated Gene Symbol', 'condition', 'avg_lfc')
control_z_scores = z_scored_lfcs.loc[z_scored_lfcs['Annotated Gene Symbol'].isin(nonessential_genes), 'z_scored_avg_lfc']
assert np.abs(control_z_scores.mean()) < 0.001
assert np.abs(control_z_scores.std() - 1) < 0.001
scaled_lfcs = scale_lfcs(annotated_lfcs, nonessential_genes, essential_genes, 'Annotated Gene Symbol', 'condition', 'avg_lfc', -1)
pos_scaled_lfcs = scaled_lfcs[scaled_lfcs['Annotated Gene Symbol'].isin(essential_genes)]
assert pos_scaled_lfcs['control_scaled_avg_lfc'].median() == -1
neg_scaled_lfcs = scaled_lfcs[scaled_lfcs['Annotated Gene Symbol'].isin(nonessential_genes)]
assert neg_scaled_lfcs['control_scaled_avg_lfc'].median() == 0
assert pos_scaled_lfcs['probability_positive_control'].mean() > neg_scaled_lfcs['probability_positive_control'].mean()
All of these functions are wrapped together in annotate_guide_lfcs
annot_guide_lfcs = annotate_guide_lfcs(avg_replicate_lfc_df, remapped_annotations, 'Annotated Gene Symbol',
merge_on='sgRNA Sequence',
z_score_neg_ctls=True, z_score_neg_ctl_genes=nonessential_genes,
scale_ctls=True, scale_neg_ctl_genes=nonessential_genes,
scale_pos_ctl_genes=essential_genes, pos_control_direction=-1)
annot_guide_lfcs.sort_values('z_scored_avg_lfc')
plot_lfcs = annot_guide_lfcs.copy()
plot_lfcs['neg_control'] = plot_lfcs['Annotated Gene Symbol'].isin(nonessential_genes)
plot_lfcs['nlog10_pos_ctrl'] = -np.log10(plot_lfcs['probability_positive_control'])
plot_lfcs = plot_lfcs.sort_values('neg_control')
plt.subplots(figsize=(4,4))
sns.scatterplot(data=plot_lfcs, x='z_scored_avg_lfc',
y='control_scaled_avg_lfc', hue='neg_control', alpha=0.3)
sns.despine()
To aggregate scores at the gene level, we specify columns to average, and columns to z-score
gene_lfcs = aggregate_gene_lfcs(annot_guide_lfcs, 'Annotated Gene Symbol',
average_cols=['avg_lfc', 'control_scaled_avg_lfc', 'probability_positive_control'],
zscore_cols=['z_scored_avg_lfc'])
assert np.abs(gene_lfcs.shape[0] - 20000) < 1000
# Do we score about the expect number of essential genes?
assert np.abs((gene_lfcs['probability_positive_control'] > 0.5).sum()/20000 - 0.2) < 0.1
assert gene_lfcs.sort_values('probability_positive_control', ascending=False).head(5)['Annotated Gene Symbol'].isin(essential_genes).sum() > 2
assert gene_lfcs[gene_lfcs.z_scored_avg_lfc < 0].sort_values('z_scored_avg_lfc_fdr', ascending=True).head(5)['Annotated Gene Symbol'].isin(essential_genes).sum() > 2
# Do the two scoring mechanisms agree
gene_lfcs_copy = gene_lfcs.copy()
gene_lfcs_copy['z_essential'] = (gene_lfcs_copy['z_scored_avg_lfc_fdr'] < 0.1) & (gene_lfcs_copy['z_scored_avg_lfc'] < 0)
gene_lfcs_copy['p_essential'] = (gene_lfcs_copy['probability_positive_control'] > 0.5)
scoring_overlap = (gene_lfcs_copy[['z_essential', 'p_essential']].value_counts()
.reset_index()
.rename({0: 'n'}, axis=1))
agree = scoring_overlap.loc[(scoring_overlap['z_essential'] == scoring_overlap['p_essential']), 'n'].sum()
disagree = scoring_overlap.loc[(scoring_overlap['z_essential'] != scoring_overlap['p_essential']), 'n'].sum()
accuracy = agree/(agree + disagree)
assert accuracy > 0.95
# Are controls centered around 0
control_z_scores = gene_lfcs.loc[gene_lfcs['Annotated Gene Symbol'].isin(nonessential_genes), 'z_scored_avg_lfc']
assert np.abs(control_z_scores.mean()) < 0.1
assert np.abs(control_z_scores.std() - 1) < 0.3
gene_lfcs.sort_values('avg_lfc')
We see that our negative control population, which are the nonessential genes in this case, are grouped around 0
plot_lfcs = gene_lfcs.copy()
plot_lfcs['nlog10_fdr'] = -np.log10(plot_lfcs['z_scored_avg_lfc_fdr'])
plot_lfcs['neg_control'] = plot_lfcs['Annotated Gene Symbol'].isin(nonessential_genes)
plot_lfcs = plot_lfcs.sort_values('neg_control')
plt.subplots(figsize=(4,4))
sns.scatterplot(data=plot_lfcs, x='z_scored_avg_lfc',
y='nlog10_fdr', hue='neg_control', alpha=0.3)
sns.despine()
We can also calculate the gene level p-values using the hypergeometric test as follows:
gene_lfcs_hyp = aggregate_gene_lfcs_hypergeometric(annot_guide_lfcs, 'Annotated Gene Symbol',
average_cols=['avg_lfc'])
gene_lfcs_hyp
plot_lfcs_hyp = gene_lfcs_hyp.copy()
plot_lfcs_hyp = plot_lfcs_hyp[(plot_lfcs_hyp.n_guides>=3)&(plot_lfcs_hyp.n_guides<=5)]
hyp_ctrls = plot_lfcs_hyp[plot_lfcs_hyp['Annotated Gene Symbol'].isin(nonessential_genes)]
plt.subplots(figsize=(4,4))
sns.scatterplot(data=plot_lfcs_hyp, x='avg_lfc',
y='avg_lfc_hypergeometric_test_avg_pval')
sns.scatterplot(data=hyp_ctrls, x='avg_lfc',
y='avg_lfc_hypergeometric_test_avg_pval')
sns.despine()
Finally, to evaluate the quality each screen, we'll calculate the ROC-AUC between essential and nonessential genes for each condition
We can calculate ROC-AUCs three different ways:
1) We can specify a condition_col
and score_col
roc_aucs, roc_df = get_roc_aucs(lfcs=gene_lfcs, tp_genes=essential_genes, fp_genes=nonessential_genes, gene_col='Annotated Gene Symbol',
condition_col='condition', score_col='avg_lfc')
assert roc_aucs['ROC-AUC'][0] > 0.9
roc_aucs
plt.subplots(figsize=(4,4))
sns.lineplot(data=roc_df, x='fpr', y='tpr', hue='condition', ci=None)
sns.despine()
We can also specify tp_genes
and fp_genes
using regular expressions. This is useful for distinguishing
between different control types defined by a regex.
roc_aucs, _ = get_roc_aucs(lfcs=gene_lfcs, tp_genes='RPS', fp_genes='CD', gene_col='Annotated Gene Symbol',
condition_col='condition', score_col='avg_lfc')
assert roc_aucs['ROC-AUC'][0] > 0.8
2) Since we only have one condition we can also just supply the score_col
roc_aucs, roc_df = get_roc_aucs(lfcs=gene_lfcs, tp_genes=essential_genes, fp_genes=nonessential_genes, gene_col='Annotated Gene Symbol',
score_col='avg_lfc')
roc_aucs
plt.subplots(figsize=(4,4))
sns.lineplot(data=roc_df, x='fpr', y='tpr', ci=None)
sns.despine()
Finally, we can specify condition_list
as a list of columns we want to calculate AUCs for
roc_aucs, roc_df = get_roc_aucs(lfcs=gene_lfcs, tp_genes=essential_genes, fp_genes=nonessential_genes, gene_col='Annotated Gene Symbol',
condition_list=['avg_lfc'])
roc_aucs
This is a useful approach when we want to quantify ROC-AUC at the guide level say
annotated_guide_lfcs = lfc_df.merge(guide_annotations, how='inner', on='sgRNA Sequence')
roc_aucs, roc_df = get_roc_aucs(lfcs=annotated_guide_lfcs, tp_genes=essential_genes, fp_genes=nonessential_genes, gene_col='Annotated Gene Symbol',
condition_list=['A375_RepA', 'A375_RepB'])
roc_aucs
plt.subplots(figsize=(4,4))
sns.lineplot(data=roc_df, x='fpr', y='tpr', hue='condition', ci=None)
sns.despine()
We can see this visually as well that essentials deplete the most followed by nonessentials and control genes
plt.subplots(figsize=(4,4))
sns.kdeplot(gene_lfcs.loc[gene_lfcs['Annotated Gene Symbol'].isin(essential_genes), 'z_scored_avg_lfc'], label='essential')
sns.kdeplot(gene_lfcs.loc[gene_lfcs['Annotated Gene Symbol'].isin(nonessential_genes), 'z_scored_avg_lfc'], label='non-essential')
sns.kdeplot(gene_lfcs.loc[gene_lfcs['Annotated Gene Symbol'].str.contains('_'), 'z_scored_avg_lfc'], label='control')
sns.despine()
plt.legend()