Rule set 3 target-site predictions
import lightgbm
import pandas as pd
from rs3 import targetdata
from scipy import stats
import numpy as np
__file__ = os.path.abspath('') + '/03_predicttarg.ipynb'
import multiprocessing
max_n_jobs = multiprocessing.cpu_count()

load_target_model[source]

load_target_model(lite=False)

Load rule set 3 target model

assert type(load_target_model()['regressor']) == lightgbm.sklearn.LGBMRegressor
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  warnings.warn(
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  warnings.warn(

predict_target[source]

predict_target(design_df, aa_subseq_df, domain_feature_df=None, conservation_feature_df=None, id_cols=None)

Make predictions using the Rule Set 3 target model. Note that if the protein_domain_df or conservation_df are not supplied, then the lite model will be used, otherwise the full model is used.

:param design_df: DataFrame :param aa_subseq_df: DataFrame :param domain_feature_df: DataFrame :param id_cols: list or str :return: list

design_df = pd.read_table('test_data/sgrna-designs.txt')
design_targ_df = targetfeat.add_target_columns(design_df)
id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']
aa_seq_df = targetdata.build_transcript_aa_seq_df(design_df, n_jobs=2)
aa_subseq_df = targetfeat.get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,
                                           id_cols=id_cols)
aa_subseq_df
Getting amino acid sequences
100%|██████████| 4/4 [00:02<00:00,  1.47it/s]
/Users/peterdeweirdt/Documents/rs3/rs3/targetdata.py:111: UserWarning: Unable to find translations for the following transcripts: ENST00000629399, ENST00000284049
  warnings.warn('Unable to find translations for the following transcripts: ' + ', '.join(missing_seqs))
Target Transcript Target Total Length Transcript Base molecule id seq version desc AA len Target Cut Length Orientation sgRNA Context Sequence AA Index extended_seq AA 0-Indexed AA 0-Indexed padded seq_start seq_end AA Subsequence
0 ENST00000259457.8 834 ENST00000259457 protein ENSP00000259457 MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... 3 None 277 191 sense TGGAGCAGATACAAGAGCAACTGAAGGGAT 64 -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... 63 80 64 96 GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI
1 ENST00000259457.8 834 ENST00000259457 protein ENSP00000259457 MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... 3 None 277 137 sense CCGGAAAACTGGCACGACCATCGCTGGGGT 46 -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... 45 62 46 78 AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR
2 ENST00000394249.8 1863 ENST00000394249 protein ENSP00000377793 MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... 3 None 620 316 sense TAGAAAAAGATTTGCGCACCCAAGTGGAAT 106 -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... 105 122 106 138 EEGETTILQLEKDLRTQVELMRKQKKERKQELK
3 ENST00000394249.8 1863 ENST00000394249 protein ENSP00000377793 MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... 3 None 620 787 antisense TGGCCTTTGACCCAGACATAATGGTGGCCA 263 -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... 262 279 263 295 WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV
4 ENST00000361337.3 2298 ENST00000361337 protein ENSP00000354522 MSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK... 2 None 765 420 antisense AAATACTCACTCATCCTCATCTCGAGGTCT 140 -----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK... 139 156 140 172 GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
391 ENST00000454402.7 1023 ENST00000454402 protein ENSP00000408295 METSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK... 2 None 340 220 antisense TGTCTTTATATAGCTGTTTCGCACAGGCTA 74 -----------------METSALKQQEQPAATKIRNLPWVEKYRPQ... 73 90 74 106 LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL
392 ENST00000254998.3 423 ENST00000254998 protein ENSP00000254998 MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... 2 None 140 79 sense TTGTCAATGTCTACTACACCACCATGGATA 27 -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... 26 43 27 59 DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA
393 ENST00000254998.3 423 ENST00000254998 protein ENSP00000254998 MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... 2 None 140 115 sense GGCGTTTGCTGTCCCGCCTGTACATGGGCA 39 -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... 38 55 39 71 VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ
394 ENST00000381685.10 2067 ENST00000381685 protein ENSP00000371101 MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... 5 None 688 776 antisense ACTAGCAATGGCTTATCAGATCGAAGGTCA 259 -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... 258 275 259 291 TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI
395 ENST00000381685.10 2067 ENST00000381685 protein ENSP00000371101 MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... 5 None 688 322 sense AAATTTTGTCTGATGACTACTCAAAGGTAT 108 -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... 107 124 108 140 CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ

396 rows × 19 columns

domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)
domain_feature_df = targetfeat.get_protein_domain_features(design_targ_df, domain_df, sources=None,
                                                           id_cols=id_cols)
Getting protein domains
100%|██████████| 198/198 [01:17<00:00,  2.54it/s]
conservation_df = targetdata.build_conservation_df(design_df, n_jobs=max_n_jobs)
conservation_feature_df = targetfeat.get_conservation_features(design_targ_df, conservation_df,
                                                             small_width=2, large_width=16,
                                                             conservation_column='ranked_conservation',
                                                             id_cols=id_cols)
conservation_feature_df
Getting conservation
100%|██████████| 200/200 [04:30<00:00,  1.35s/it]
/Users/peterdeweirdt/Documents/rs3/rs3/targetdata.py:346: UserWarning: Failed to get conservation scores for 2 transcriptsENST00000629399, ENST00000284049
  warnings.warn('Failed to get conservation scores for ' + str(len(failed_list)) +
sgRNA Context Sequence Target Cut Length Target Transcript Orientation cons_4 cons_32
0 AAAAGAATGATGAAAAGACACCACAGGGAG 244 ENST00000610426.5 sense 0.218231 0.408844
1 AAAAGAGCCATGAATCTAAACATCAGGAAT 640 ENST00000223073.6 sense 0.129825 0.278180
2 AAAAGCGCCAAATGGCCCGAGAATTGGGAG 709 ENST00000331923.9 sense 0.470906 0.532305
3 AAACAGAAAAAGTTAAAATCACCAAGGTGT 496 ENST00000283882.4 sense 0.580556 0.602708
4 AAACAGATGGAAGATGCTTACCGGGGGACC 132 ENST00000393047.8 sense 0.283447 0.414293
... ... ... ... ... ... ...
391 TTTGATTGCATTAAGGTTGGACTCTGGATT 246 ENST00000249269.9 sense 0.580612 0.618707
392 TTTGCCCACAGCTCCAAAGCATCGCGGAGA 130 ENST00000227618.8 sense 0.323770 0.416368
393 TTTTACAGTGCGATGTATGATGTATGGCTT 119 ENST00000338366.6 sense 0.788000 0.537417
394 TTTTGGATCTCGTAGTGATTCAAGAGGGAA 233 ENST00000629496.3 sense 0.239630 0.347615
395 TTTTTGTTACTACAGGTTCGCTGCTGGGAA 201 ENST00000395840.6 sense 0.693767 0.639044

396 rows × 6 columns

predictions = predict_target(design_df=design_df,
                             aa_subseq_df=aa_subseq_df,
                             domain_feature_df=domain_feature_df,
                             conservation_feature_df=conservation_feature_df)
design_df['Target Score'] = predictions
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  warnings.warn(
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  warnings.warn(
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SimpleImputer was fitted without feature names
  warnings.warn(
lite_predictions = predict_target(design_df=design_df,
                                  aa_subseq_df=aa_subseq_df)
design_df['Target Score Lite'] = lite_predictions
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  warnings.warn(
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  warnings.warn(
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SimpleImputer was fitted without feature names
  warnings.warn(
assert stats.pearsonr(design_df['Target Score'], design_df['Target Score Lite'])[0] > 0.5
sanger_df = pd.read_csv('test_data/Behan2019_activity.csv')
gecko_df = pd.read_csv('test_data/Aguirre2016_activity.csv')

sanger_designs = sanger_df.merge(design_df, how='inner',
                                 on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',
                                     'Target Cut %'])
gecko_designs = gecko_df.merge(design_df, how='inner',
                                on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',
                                    'Target Cut %'])
assert stats.pearsonr(sanger_designs['avg_mean_centered_neg_lfc'],
                      sanger_designs['Target Score'])[0] > 0.1