Rule set 3 target-site predictions
import lightgbm
import pandas as pd
from rs3 import targetdata
from scipy import stats
import numpy as np
__file__ = os.path.abspath('') + '/03_predicttarg.ipynb'
import multiprocessing
max_n_jobs = multiprocessing.cpu_count()

load_target_model[source]

load_target_model(lite=False)

Load rule set 3 target model

assert type(load_target_model()['regressor']) == lightgbm.sklearn.LGBMRegressor
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(

predict_target[source]

predict_target(design_df, aa_subseq_df, domain_feature_df=None, conservation_feature_df=None, id_cols=None)

Make predictions using the Rule Set 3 target model. Note that if the protein_domain_df or conservation_df are not supplied, then the lite model will be used, otherwise the full model is used.

:param design_df: DataFrame :param aa_subseq_df: DataFrame :param domain_feature_df: DataFrame :param id_cols: list or str :return: list

design_df = pd.read_table('test_data/sgrna-designs.txt')
design_targ_df = targetfeat.add_target_columns(design_df)
id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']
aa_seq_df = targetdata.build_transcript_aa_seq_df(design_df, n_jobs=2)
aa_subseq_df = targetfeat.get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,
                                           id_cols=id_cols)
aa_subseq_df
Getting amino acid sequences
100%|██████████| 4/4 [00:04<00:00,  1.04s/it]
Target Transcript Target Total Length Transcript Base desc molecule seq id version AA len sgRNA Context Sequence AA Index Target Cut Length Orientation extended_seq AA 0-Indexed AA 0-Indexed padded seq_start seq_end AA Subsequence
0 ENST00000259457.8 834 ENST00000259457 None protein MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... ENSP00000259457 3 277 TGGAGCAGATACAAGAGCAACTGAAGGGAT 64 191 sense -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... 63 80 64 96 GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI
1 ENST00000259457.8 834 ENST00000259457 None protein MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... ENSP00000259457 3 277 CCGGAAAACTGGCACGACCATCGCTGGGGT 46 137 sense -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... 45 62 46 78 AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR
2 ENST00000394249.8 1863 ENST00000394249 None protein MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... ENSP00000377793 3 620 TAGAAAAAGATTTGCGCACCCAAGTGGAAT 106 316 sense -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... 105 122 106 138 EEGETTILQLEKDLRTQVELMRKQKKERKQELK
3 ENST00000394249.8 1863 ENST00000394249 None protein MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... ENSP00000377793 3 620 TGGCCTTTGACCCAGACATAATGGTGGCCA 263 787 antisense -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... 262 279 263 295 WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV
4 ENST00000361337.3 2298 ENST00000361337 None protein MSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK... ENSP00000354522 2 765 AAATACTCACTCATCCTCATCTCGAGGTCT 140 420 antisense -----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK... 139 156 140 172 GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
395 ENST00000454402.7 1023 ENST00000454402 None protein METSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK... ENSP00000408295 2 340 TGTCTTTATATAGCTGTTTCGCACAGGCTA 74 220 antisense -----------------METSALKQQEQPAATKIRNLPWVEKYRPQ... 73 90 74 106 LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL
396 ENST00000254998.3 423 ENST00000254998 None protein MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... ENSP00000254998 2 140 TTGTCAATGTCTACTACACCACCATGGATA 27 79 sense -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... 26 43 27 59 DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA
397 ENST00000254998.3 423 ENST00000254998 None protein MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... ENSP00000254998 2 140 GGCGTTTGCTGTCCCGCCTGTACATGGGCA 39 115 sense -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... 38 55 39 71 VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ
398 ENST00000381685.10 2067 ENST00000381685 None protein MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... ENSP00000371101 5 688 ACTAGCAATGGCTTATCAGATCGAAGGTCA 259 776 antisense -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... 258 275 259 291 TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI
399 ENST00000381685.10 2067 ENST00000381685 None protein MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... ENSP00000371101 5 688 AAATTTTGTCTGATGACTACTCAAAGGTAT 108 322 sense -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... 107 124 108 140 CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ

400 rows × 19 columns

domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)
domain_feature_df = targetfeat.get_protein_domain_features(design_targ_df, domain_df, sources=None,
                                                           id_cols=id_cols)
Getting protein domains
100%|██████████| 200/200 [00:48<00:00,  4.12it/s]
conservation_df = targetdata.build_conservation_df(design_df, n_jobs=max_n_jobs)
conservation_feature_df = targetfeat.get_conservation_features(design_targ_df, conservation_df,
                                                             small_width=2, large_width=16,
                                                             conservation_column='ranked_conservation',
                                                             id_cols=id_cols)
conservation_feature_df
Getting conservation
100%|██████████| 200/200 [03:53<00:00,  1.17s/it]
sgRNA Context Sequence Target Cut Length Target Transcript Orientation cons_4 cons_32
0 AAAAGAATGATGAAAAGACACCACAGGGAG 244 ENST00000610426.5 sense 0.218231 0.408844
1 AAAAGAGCCATGAATCTAAACATCAGGAAT 640 ENST00000223073.6 sense 0.129825 0.278180
2 AAAAGCGCCAAATGGCCCGAGAATTGGGAG 709 ENST00000331923.9 sense 0.470906 0.532305
3 AAACAGAAAAAGTTAAAATCACCAAGGTGT 496 ENST00000283882.4 sense 0.580556 0.602708
4 AAACAGATGGAAGATGCTTACCGGGGGACC 132 ENST00000393047.8 sense 0.283447 0.414293
... ... ... ... ... ... ...
395 TTTGATTGCATTAAGGTTGGACTCTGGATT 246 ENST00000249269.9 sense 0.580612 0.618707
396 TTTGCCCACAGCTCCAAAGCATCGCGGAGA 130 ENST00000227618.8 sense 0.323770 0.416368
397 TTTTACAGTGCGATGTATGATGTATGGCTT 119 ENST00000338366.6 sense 0.788000 0.537417
398 TTTTGGATCTCGTAGTGATTCAAGAGGGAA 233 ENST00000629496.3 sense 0.239630 0.347615
399 TTTTTGTTACTACAGGTTCGCTGCTGGGAA 201 ENST00000395840.6 sense 0.693767 0.639044

400 rows × 6 columns

predictions = predict_target(design_df=design_df,
                             aa_subseq_df=aa_subseq_df,
                             domain_feature_df=domain_feature_df,
                             conservation_feature_df=conservation_feature_df)
design_df['Target Score'] = predictions
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
lite_predictions = predict_target(design_df=design_df,
                                  aa_subseq_df=aa_subseq_df)
design_df['Target Score Lite'] = lite_predictions
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
design_df['sgRNA Context Sequence']
0      TGGAGCAGATACAAGAGCAACTGAAGGGAT
1      CCGGAAAACTGGCACGACCATCGCTGGGGT
2      TAGAAAAAGATTTGCGCACCCAAGTGGAAT
3      TGGCCTTTGACCCAGACATAATGGTGGCCA
4      AAATACTCACTCATCCTCATCTCGAGGTCT
                    ...              
395    TGTCTTTATATAGCTGTTTCGCACAGGCTA
396    TTGTCAATGTCTACTACACCACCATGGATA
397    GGCGTTTGCTGTCCCGCCTGTACATGGGCA
398    ACTAGCAATGGCTTATCAGATCGAAGGTCA
399    AAATTTTGTCTGATGACTACTCAAAGGTAT
Name: sgRNA Context Sequence, Length: 400, dtype: object
assert stats.pearsonr(design_df['Target Score'], design_df['Target Score Lite'])[0] > 0.7
sanger_df = pd.read_csv('test_data/Behan2019_activity.csv')
gecko_df = pd.read_csv('test_data/Aguirre2016_activity.csv')

sanger_designs = sanger_df.merge(design_df, how='inner',
                                 on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',
                                     'Target Cut %'])
gecko_designs = gecko_df.merge(design_df, how='inner',
                                on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',
                                    'Target Cut %'])
assert stats.pearsonr(sanger_designs['avg_mean_centered_neg_lfc'],
                      sanger_designs['Target Score'])[0] > 0.2
assert stats.pearsonr(gecko_designs['avg_mean_centered_neg_lfc'],
                      gecko_designs['Target Score'])[0] > 0.05
rs_dev_target_lite_predictions = (pd.read_csv('test_data/target_lite_score_export.csv')
                                  .rename({'Target Lite Score': 'Target Score Lite'}, axis=1))
rs_dev_target_predictions = pd.read_csv('test_data/target_score_export.csv')
merged_rs_dev_predictions = rs_dev_target_lite_predictions.merge(rs_dev_target_predictions,
                                                                 how='inner')
merged_rs_dev_rs3_predictions = (design_df
                                 .merge(merged_rs_dev_predictions,
                                        how='inner',
                                        on=['sgRNA Context Sequence', 'Target Cut Length',
                                            'Target Transcript', 'Orientation'],
                                        suffixes=[' rs3', ' rs_dev']))
assert np.allclose(merged_rs_dev_rs3_predictions['Target Score rs3'], merged_rs_dev_rs3_predictions['Target Score rs_dev'])
assert np.allclose(merged_rs_dev_rs3_predictions['Target Score Lite rs3'], merged_rs_dev_rs3_predictions['Target Score Lite rs_dev'])