Rule Set 3 sequence-based model
import lightgbm
from scipy.stats import spearmanr
import gpplot
import matplotlib.pyplot as plt
import seaborn as sns
# Benchmark activity tables used by the correlation checks further down.
# NOTE(review): `pd` and `os` are not imported in the visible cells —
# presumably a hidden cell provides them; confirm.
geckov2_data = pd.read_csv('test_data/Aguirre2016_activity.csv')
sanger_data = pd.read_csv('test_data/Behan2019_activity.csv')
# Point __file__ at this notebook inside the current working directory.
# os.path.join picks the platform's separator instead of a hard-coded '/'.
# NOTE(review): presumably needed because library code resolves paths
# relative to __file__, which a notebook does not define — confirm.
__file__ = os.path.join(os.path.abspath(''), '00_seq.ipynb')

load_seq_model[source]

load_seq_model()

Load the Rule Set 3 sequence model.

# The loader must return a LightGBM regressor. isinstance is the idiomatic
# type check and, unlike `type(...) ==`, also accepts subclasses.
assert isinstance(load_seq_model(), lightgbm.sklearn.LGBMRegressor)

featurize_context[source]

featurize_context(context_sequences, sequence_tracr='Hsu2013', ref_tracrs=None, n_jobs=1)

Featurize context sequences

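:param context_sequences: list-like, context sequences to featurize
:param sequence_tracr: list-like or str, tracrRNA name(s); a single str is applied to every sequence, a list-like gives one name per sequence
:return: DataFrame, feature matrix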

# Two 30-nt context sequences used to spot-check the feature matrix.
test_sgrnas = ['GACGAAAGCGACAACGCGTTCATCCGGGCA', 'AGAAAACACTAGCATCCCCACCCGCGGACT']
sequence_tracr = ['Hsu2013', 'Chen2013']
first_context, second_context = test_sgrnas

# Case 1: list-like `sequence_tracr`, one tracr name per context sequence.
# Each row is (row label, feature column, expected value).
featurized_test = featurize_context(test_sgrnas, sequence_tracr=sequence_tracr)
per_sequence_expectations = [
    (first_context, '-4G', 1),
    (second_context, '1AA', 1),
    (first_context, 'Chen2013 tracr', 0),
    (first_context, 'Hsu2013 tracr', 1),
    (second_context, 'Chen2013 tracr', 1),
    (second_context, 'Hsu2013 tracr', 0),
]
for context, feature, expected in per_sequence_expectations:
    assert featurized_test.loc[context, feature] == expected

# Case 2: a single str `sequence_tracr`; both sequences are expected to be
# flagged with that tracr and not with the other one.
featurized_test = featurize_context(test_sgrnas, sequence_tracr='Hsu2013')
shared_tracr_expectations = [
    (first_context, 'Hsu2013 tracr', 1),
    (first_context, 'Chen2013 tracr', 0),
    (second_context, 'Hsu2013 tracr', 1),
    (second_context, 'Chen2013 tracr', 0),
]
for context, feature, expected in shared_tracr_expectations:
    assert featurized_test.loc[context, feature] == expected
100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 144.95it/s]
100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 247.74it/s]

predict_seq[source]

predict_seq(context_sequences, sequence_tracr='Hsu2013', ref_tracrs=None, n_jobs=1)

Predict the activity of context sequences for SpCas9 knockout using sequence information only.

:param context_sequences: list of str
:return: list of float, predictions

# Score every GeckoV2 context sequence with the 'Hsu2013' tracr setting.
geckov2_predictions = predict_seq(
    geckov2_data['sgRNA Context Sequence'],
    sequence_tracr='Hsu2013',
)
# Rank correlation between predictions and measured activity; the first
# element of the spearmanr result is the correlation coefficient.
geckov2_rho, _ = spearmanr(geckov2_predictions,
                           geckov2_data['avg_mean_centered_neg_lfc'])
assert geckov2_rho > 0.5
Calculating sequence-based features
100%|██████████████████████████████████████| 8659/8659 [00:14<00:00, 612.35it/s]
# Score every Sanger context sequence with the 'Chen2013' tracr setting.
sanger_predictions = predict_seq(
    sanger_data['sgRNA Context Sequence'],
    sequence_tracr='Chen2013',
)
# Rank correlation between predictions and measured activity; the first
# element of the spearmanr result is the correlation coefficient.
sanger_rho, _ = spearmanr(sanger_predictions,
                          sanger_data['avg_mean_centered_neg_lfc'])
assert sanger_rho > 0.25
Calculating sequence-based features
100%|██████████████████████████████████████| 7442/7442 [00:12<00:00, 610.30it/s]
# Pair each measured GeckoV2 activity value with its model prediction so the
# two can be plotted against each other.
geckov2_activity_df = pd.DataFrame({'activity': geckov2_data['avg_mean_centered_neg_lfc'],
                                    'prediction': geckov2_predictions})
# Open a 4x4 inch figure; the calls below do not receive an axes argument,
# so they presumably draw on this current axes — keep the statement order.
plt.subplots(figsize=(4, 4))
# Remove the top and right spines (seaborn's default for despine()).
sns.despine()
# Plot activity (y) against prediction (x).
# NOTE(review): presumably a scatter coloured by local point density — confirm in gpplot.
gpplot.point_densityplot(data=geckov2_activity_df, y='activity', x='prediction')
# Add the Spearman correlation of the two columns to the plot; as the last
# expression of the cell, its return value is what the notebook displays.
gpplot.add_correlation(data=geckov2_activity_df, y='activity', x='prediction', method='spearman')
<Axes: xlabel='prediction', ylabel='activity'>
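Figure: point-density plot of measured GeckoV2 activity (y-axis) against the sequence-model prediction (x-axis), annotated with the Spearman correlation.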