Wrap all components of making rule set 3 predictions into one function
from rs3.targetdata import write_transcript_data, write_conservation_data
design_df = pd.read_table('test_data/sgrna-designs.txt')
sanger_activity = pd.read_csv('test_data/Behan2019_activity.csv')
gecko_activity = pd.read_csv('test_data/Aguirre2016_activity.csv')
import multiprocessing
max_n_jobs = multiprocessing.cpu_count()

predict_seq_tracr[source]

predict_seq_tracr(design_df, tracr, context_col, ref_tracrs, n_jobs)

combine_target_seq_scores[source]

combine_target_seq_scores(design_df, tracr, target_score_col, lite)

predict[source]

predict(design_df, tracr=None, target=False, aa_seq_file=None, domain_file=None, conservatin_file=None, id_cols=None, context_col='sgRNA Context Sequence', transcript_id_col='Target Transcript', transcript_base_col='Transcript Base', transcript_len_col='Target Total Length', n_jobs_min=1, n_jobs_max=1, lite=True)

Make predictions using RS3

:param design_df: DataFrame :param tracr: str or list :param target: bool, whether to include target scores :param aa_seq_file: str, path to precomputed amino acid sequences :param domain_file: str, path to precomputed domain file :param id_cols: list or None :param context_col: str :param transcript_id_col: str :param transcript_base_col: str :param transcript_len_col: str :param n_jobs_min: int :return: DataFram

scored_designs = predict(design_df, tracr=['Hsu2013', 'Chen2013'], target=True,
                         n_jobs_min=2, n_jobs_max=max_n_jobs,
                         lite=False)
Calculating sequence-based features
100%|██████████| 400/400 [00:04<00:00, 85.01it/s] 
Calculating sequence-based features
100%|██████████| 400/400 [00:00<00:00, 2471.62it/s]
Getting amino acid sequences
100%|██████████| 4/4 [00:08<00:00,  2.25s/it]
Getting protein domains
100%|██████████| 200/200 [00:59<00:00,  3.35it/s]
Getting conservation
100%|██████████| 200/200 [03:36<00:00,  1.08s/it]
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
write_transcript_data(design_df, n_jobs=2,
                      filepath='./data/target_data/',
                      aa_seq_name='aa_seqs.pq',
                      protein_domain_name='protein_domains.pq')
Getting amino acid sequences
100%|██████████| 4/4 [00:05<00:00,  1.31s/it]
Getting protein domains
100%|██████████| 200/200 [01:00<00:00,  3.30it/s]
write_conservation_data(design_df, n_jobs=max_n_jobs,
                        filepath='./data/target_data/',
                        cons_file_name='conservation.pq')
Getting conservation
100%|██████████| 200/200 [03:35<00:00,  1.08s/it]
scored_designs_stored = predict(design_df, tracr=['Hsu2013', 'Chen2013'], target=True,
                                n_jobs_min=2, n_jobs_max=max_n_jobs,
                                aa_seq_file='./data/target_data/aa_seqs.pq',
                                domain_file='./data/target_data/protein_domains.pq',
                                conservatin_file='./data/target_data/conservation.pq',
                                lite=False)
pd.testing.assert_frame_equal(scored_designs, scored_designs_stored)
Calculating sequence-based features
100%|██████████| 400/400 [00:00<00:00, 628.94it/s]
Calculating sequence-based features
100%|██████████| 400/400 [00:00<00:00, 2256.05it/s]
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
  warnings.warn(
activity_id_cols = ['sgRNA Sequence', 'sgRNA Context Sequence',
                    'Target Gene Symbol', 'Target Cut %']
sanger_actvity_scores = (sanger_activity.merge(scored_designs_stored,
                                               how='inner',
                                               on=activity_id_cols))
gecko_activity_scores = (gecko_activity.merge(scored_designs_stored,
                                              how='inner',
                                              on=activity_id_cols))
import seaborn as sns
activity_col = 'avg_mean_centered_neg_lfc'
score_cols = ['RS3 Sequence Score (Hsu2013 tracr)',
              'RS3 Sequence Score (Chen2013 tracr)',
              'Target Score',
              'RS3 Sequence (Hsu2013 tracr) + Target Score',
              'RS3 Sequence (Chen2013 tracr) + Target Score']
sanger_activity_cors = sanger_actvity_scores[[activity_col] + score_cols].corr()
sns.clustermap(sanger_activity_cors, annot=True, cmap='RdBu_r', vmin=-1, vmax=1)
<seaborn.matrix.ClusterGrid at 0x7fc6d0223340>
assert (sanger_activity_cors.loc[activity_col, 'RS3 Sequence (Chen2013 tracr) + Target Score'] >
        sanger_activity_cors.loc[activity_col, 'RS3 Sequence (Hsu2013 tracr) + Target Score'] >
        sanger_activity_cors.loc[activity_col, 'RS3 Sequence Score (Chen2013 tracr)'] >
        sanger_activity_cors.loc[activity_col, 'RS3 Sequence Score (Hsu2013 tracr)'] >
        sanger_activity_cors.loc[activity_col, 'Target Score'])
gecko_activity_cors = gecko_activity_scores[[activity_col] + score_cols].corr()
sns.clustermap(gecko_activity_cors, annot=True, cmap='RdBu_r', vmin=-1, vmax=1)
<seaborn.matrix.ClusterGrid at 0x7fc6cfa528e0>
assert (gecko_activity_cors.loc[activity_col, 'RS3 Sequence (Hsu2013 tracr) + Target Score'] >
        gecko_activity_cors.loc[activity_col, 'RS3 Sequence Score (Hsu2013 tracr)'] >
        gecko_activity_cors.loc[activity_col, 'RS3 Sequence Score (Chen2013 tracr)'] >
        gecko_activity_cors.loc[activity_col, 'RS3 Sequence (Chen2013 tracr) + Target Score'] >
        gecko_activity_cors.loc[activity_col, 'Target Score'])