Wrap all components of making rule set 3 predictions into one function
from rs3.targetdata import write_transcript_data, write_conservation_data
design_df = pd.read_table('test_data/sgrna-designs.txt')
sanger_activity = pd.read_csv('test_data/Behan2019_activity.csv')
gecko_activity = pd.read_csv('test_data/Aguirre2016_activity.csv')
import multiprocessing
max_n_jobs = multiprocessing.cpu_count()

predict_seq_tracr[source]

predict_seq_tracr(design_df, tracr, context_col, ref_tracrs, n_jobs)

combine_target_seq_scores[source]

combine_target_seq_scores(design_df, tracr, target_score_col, lite)

predict[source]

predict(design_df, tracr=None, target=False, aa_seq_file=None, domain_file=None, conservatin_file=None, id_cols=None, context_col='sgRNA Context Sequence', transcript_id_col='Target Transcript', transcript_base_col='Transcript Base', transcript_len_col='Target Total Length', n_jobs_min=1, n_jobs_max=1, lite=True)

Make predictions using RS3

:param design_df: DataFrame :param tracr: str or list :param target: bool, whether to include target scores :param aa_seq_file: str, path to precomputed amino acid sequences :param domain_file: str, path to precomputed domain file :param id_cols: list or None :param context_col: str :param transcript_id_col: str :param transcript_base_col: str :param transcript_len_col: str :param n_jobs_min: int :return: DataFram

scored_designs = predict(design_df, tracr=['Hsu2013', 'Chen2013'], target=True,
                         n_jobs_min=2, n_jobs_max=max_n_jobs,
                         lite=False)
Calculating sequence-based features
100%|██████████| 400/400 [00:00<00:00, 410.28it/s]
Calculating sequence-based features
100%|██████████| 400/400 [00:00<00:00, 4219.77it/s]
Getting amino acid sequences
100%|██████████| 4/4 [00:02<00:00,  1.34it/s]
/Users/peterdeweirdt/Documents/rs3/rs3/targetdata.py:111: UserWarning: Unable to find translations for the following transcripts: ENST00000629399, ENST00000284049
  warnings.warn('Unable to find translations for the following transcripts: ' + ', '.join(missing_seqs))
/var/folders/n9/c397fj3n5f16khr7tp76tbpc0000gn/T/ipykernel_38619/2834578737.py:67: UserWarning: Missing amino acid sequences for transcripts: ENST00000629399,ENST00000284049
  warnings.warn('Missing amino acid sequences for transcripts: ' +
Getting protein domains
100%|██████████| 198/198 [01:20<00:00,  2.46it/s]
Getting conservation
100%|██████████| 200/200 [04:44<00:00,  1.42s/it]
/Users/peterdeweirdt/Documents/rs3/rs3/targetdata.py:346: UserWarning: Failed to get conservation scores for 2 transcriptsENST00000629399, ENST00000284049
  warnings.warn('Failed to get conservation scores for ' + str(len(failed_list)) +
/var/folders/n9/c397fj3n5f16khr7tp76tbpc0000gn/T/ipykernel_38619/2834578737.py:92: UserWarning: Missing conservation scores for transcripts: ENST00000629399,ENST00000284049
  warnings.warn('Missing conservation scores for transcripts: ' +
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  warnings.warn(
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  warnings.warn(
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SimpleImputer was fitted without feature names
  warnings.warn(
write_transcript_data(design_df, n_jobs=2,
                      filepath='./data/target_data/',
                      aa_seq_name='aa_seqs.pq',
                      protein_domain_name='protein_domains.pq')
Getting amino acid sequences
100%|██████████| 4/4 [00:03<00:00,  1.05it/s]
/Users/peterdeweirdt/Documents/rs3/rs3/targetdata.py:111: UserWarning: Unable to find translations for the following transcripts: ENST00000629399, ENST00000284049
  warnings.warn('Unable to find translations for the following transcripts: ' + ', '.join(missing_seqs))
Getting protein domains
100%|██████████| 198/198 [00:50<00:00,  3.94it/s]
write_conservation_data(design_df, n_jobs=max_n_jobs,
                        filepath='./data/target_data/',
                        cons_file_name='conservation.pq')
Getting conservation
100%|██████████| 200/200 [05:06<00:00,  1.53s/it]
/Users/peterdeweirdt/Documents/rs3/rs3/targetdata.py:346: UserWarning: Failed to get conservation scores for 2 transcriptsENST00000629399, ENST00000284049
  warnings.warn('Failed to get conservation scores for ' + str(len(failed_list)) +
scored_designs_stored = predict(design_df, tracr=['Hsu2013', 'Chen2013'], target=True,
                                n_jobs_min=2, n_jobs_max=max_n_jobs,
                                aa_seq_file='./data/target_data/aa_seqs.pq',
                                domain_file='./data/target_data/protein_domains.pq',
                                conservatin_file='./data/target_data/conservation.pq',
                                lite=False)
pd.testing.assert_frame_equal(scored_designs, scored_designs_stored)
Calculating sequence-based features
100%|██████████| 400/400 [00:00<00:00, 2050.58it/s]
Calculating sequence-based features
100%|██████████| 400/400 [00:00<00:00, 3244.59it/s]
/var/folders/n9/c397fj3n5f16khr7tp76tbpc0000gn/T/ipykernel_38619/2834578737.py:67: UserWarning: Missing amino acid sequences for transcripts: ENST00000629399,ENST00000284049
  warnings.warn('Missing amino acid sequences for transcripts: ' +
/var/folders/n9/c397fj3n5f16khr7tp76tbpc0000gn/T/ipykernel_38619/2834578737.py:92: UserWarning: Missing conservation scores for transcripts: ENST00000629399,ENST00000284049
  warnings.warn('Missing conservation scores for transcripts: ' +
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  warnings.warn(
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:329: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  warnings.warn(
/Users/peterdeweirdt/miniforge3/envs/test_rs3_v6/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but SimpleImputer was fitted without feature names
  warnings.warn(
activity_id_cols = ['sgRNA Sequence', 'sgRNA Context Sequence',
                    'Target Gene Symbol', 'Target Cut %']
sanger_actvity_scores = (sanger_activity.merge(scored_designs_stored,
                                               how='inner',
                                               on=activity_id_cols))
gecko_activity_scores = (gecko_activity.merge(scored_designs_stored,
                                              how='inner',
                                              on=activity_id_cols))
import seaborn as sns
activity_col = 'avg_mean_centered_neg_lfc'
score_cols = ['RS3 Sequence Score (Hsu2013 tracr)',
              'RS3 Sequence Score (Chen2013 tracr)',
              'Target Score',
              'RS3 Sequence (Hsu2013 tracr) + Target Score',
              'RS3 Sequence (Chen2013 tracr) + Target Score']
sanger_activity_cors = sanger_actvity_scores[[activity_col] + score_cols].corr()
sns.clustermap(sanger_activity_cors, annot=True, cmap='RdBu_r', vmin=-1, vmax=1)
<seaborn.matrix.ClusterGrid at 0x2912713a0>
No description has been provided for this image
assert (sanger_activity_cors.loc[activity_col, 'RS3 Sequence (Chen2013 tracr) + Target Score'] >
        sanger_activity_cors.loc[activity_col, 'RS3 Sequence (Hsu2013 tracr) + Target Score'] >
        sanger_activity_cors.loc[activity_col, 'RS3 Sequence Score (Chen2013 tracr)'] >
        sanger_activity_cors.loc[activity_col, 'RS3 Sequence Score (Hsu2013 tracr)'] >
        sanger_activity_cors.loc[activity_col, 'Target Score'])
gecko_activity_cors = gecko_activity_scores[[activity_col] + score_cols].corr()
sns.clustermap(gecko_activity_cors, annot=True, cmap='RdBu_r', vmin=-1, vmax=1)
<seaborn.matrix.ClusterGrid at 0x28a0b13d0>
No description has been provided for this image
assert (gecko_activity_cors.loc[activity_col, 'RS3 Sequence Score (Hsu2013 tracr)'] >
        gecko_activity_cors.loc[activity_col, 'RS3 Sequence (Hsu2013 tracr) + Target Score'] >
        gecko_activity_cors.loc[activity_col, 'RS3 Sequence Score (Chen2013 tracr)'] >
        gecko_activity_cors.loc[activity_col, 'RS3 Sequence (Chen2013 tracr) + Target Score'] >
        gecko_activity_cors.loc[activity_col, 'Target Score'])