# Rule Set 3 sequence-based model
import lightgbm
from scipy.stats import spearmanr
import gpplot
import matplotlib.pyplot as plt
import seaborn as sns
# Load the two sgRNA activity tables that the prediction checks compare against.
# NOTE(review): `pd` is not imported in the visible part of this file; it is
# expected to already be in scope.
activity_csvs = ('test_data/Aguirre2016_activity.csv',
                 'test_data/Behan2019_activity.csv')
geckov2_data, sanger_data = map(pd.read_csv, activity_csvs)

# Point __file__ at 00_seq.ipynb inside the current working directory
# (os.path.abspath('') resolves to the cwd).
# NOTE(review): presumably needed because code that locates resources relative
# to __file__ has no such name when run from a notebook -- confirm.
working_dir = os.path.abspath('')
__file__ = working_dir + '/00_seq.ipynb'
# load_seq_model() is expected to return a LightGBM regressor. isinstance is
# the idiomatic type check and, unlike `type(...) ==`, also accepts subclasses
# of LGBMRegressor.
assert isinstance(load_seq_model(), lightgbm.sklearn.LGBMRegressor)
def _assert_feature_values(feature_frame, expectations):
    """Assert that each (row label, column label) cell equals its expected value."""
    for row_label, column_label, expected in expectations:
        assert feature_frame.loc[row_label, column_label] == expected


# Two 30-nt context sequences used to spot-check featurize_context.
test_sgrnas = ['GACGAAAGCGACAACGCGTTCATCCGGGCA', 'AGAAAACACTAGCATCCCCACCCGCGGACT']
sequence_tracr = ['Hsu2013', 'Chen2013']

# Case 1: one tracr label per sequence. Each row should flag only its own
# tracr column; the '-4G' and '1AA' columns are also spot-checked.
# NOTE(review): '-4G' / '1AA' look like position-specific nucleotide features;
# confirm the naming scheme against featurize_context.
featurized_test = featurize_context(test_sgrnas, sequence_tracr=sequence_tracr)
_assert_feature_values(featurized_test, [
    ('GACGAAAGCGACAACGCGTTCATCCGGGCA', '-4G', 1),
    ('AGAAAACACTAGCATCCCCACCCGCGGACT', '1AA', 1),
    ('GACGAAAGCGACAACGCGTTCATCCGGGCA', 'Chen2013 tracr', 0),
    ('GACGAAAGCGACAACGCGTTCATCCGGGCA', 'Hsu2013 tracr', 1),
    ('AGAAAACACTAGCATCCCCACCCGCGGACT', 'Chen2013 tracr', 1),
    ('AGAAAACACTAGCATCCCCACCCGCGGACT', 'Hsu2013 tracr', 0),
])

# Case 2: a single tracr string. Every row should be flagged as Hsu2013 and
# not as Chen2013.
featurized_test = featurize_context(test_sgrnas, sequence_tracr='Hsu2013')
_assert_feature_values(featurized_test, [
    ('GACGAAAGCGACAACGCGTTCATCCGGGCA', 'Hsu2013 tracr', 1),
    ('GACGAAAGCGACAACGCGTTCATCCGGGCA', 'Chen2013 tracr', 0),
    ('AGAAAACACTAGCATCCCCACCCGCGGACT', 'Hsu2013 tracr', 1),
    ('AGAAAACACTAGCATCCCCACCCGCGGACT', 'Chen2013 tracr', 0),
])
# Score the geckov2_data context sequences with the Hsu2013 tracr setting and
# require a Spearman correlation above 0.5 with the measured activity column.
geckov2_contexts = geckov2_data['sgRNA Context Sequence']
geckov2_predictions = predict_seq(geckov2_contexts, sequence_tracr='Hsu2013')
geckov2_rho = spearmanr(geckov2_predictions,
                        geckov2_data['avg_mean_centered_neg_lfc'])[0]
assert geckov2_rho > 0.5

# Same check for sanger_data with the Chen2013 tracr setting; the required
# correlation here is lower (above 0.25).
sanger_contexts = sanger_data['sgRNA Context Sequence']
sanger_predictions = predict_seq(sanger_contexts, sequence_tracr='Chen2013')
sanger_rho = spearmanr(sanger_predictions,
                       sanger_data['avg_mean_centered_neg_lfc'])[0]
assert sanger_rho > 0.25
# Pair the measured activity with the model predictions for plotting.
geckov2_activity_df = pd.DataFrame(dict(
    activity=geckov2_data['avg_mean_centered_neg_lfc'],
    prediction=geckov2_predictions,
))

# 4x4 figure with seaborn's despine applied, then a point-density plot of
# activity (y) against prediction (x) annotated with the Spearman correlation.
plt.subplots(figsize=(4, 4))
sns.despine()
axis_mapping = dict(data=geckov2_activity_df, y='activity', x='prediction')
gpplot.point_densityplot(**axis_mapping)
gpplot.add_correlation(**axis_mapping, method='spearman')