Snippets

Brian Ward py3_KASP_primer_format_for_IDT_ordering

Created by Brian Ward last modified
#!/usr/bin/env python
# -*- coding: utf-8 -*-

''' 
@author: Brian Ward

This script is designed to take a text file containing some KASP primer sequences
and their names, and formats them for the IDT oligo ordering form. To do this,
it needs to perform two iterations:
    1) Select lines from the text file matching patterns - these will contain
       labels for the alternative allele primers (_AL), and the common primers
       (_C1). Each of these lines contains one tab delimiter separating the 
       primer name and the primer sequence
    2) After splitting these lines on the tab delimiter and creating a dataframe,
       it needs to search through the primer sequences for "tag" sequences that
       will have FAM and VIC dye attachments. For each of these, it then adds
       "_FAM" or "_VIC" to the end of the primer name, respectively.
'''

#### User-defined variables ####

## Directory holding the input data, given relative to the user's home
## directory. It is concatenated directly onto the home path, so it must
## begin with a '/'.
data_dir = "/Downloads"

## Name of the text file to read and of the .csv file to write
in_file = "Carpenter_KASP_design_5-16-17.txt"
out_file = "Carpenter_KASP_primers_5-17-17.csv"

## Substrings marking the lines of the text file that should be kept
keep_lines = ["_AL", "_C1"]

## Tag sequences searched for inside the primer sequences (FAM and VIC)
fam_seq = "GAAGGTGACCAAGTTCATGC"
vic_seq = "GAAGGTCGGAGTCAACGGATT"


#### Executable ####

import os
import pandas as pd

## Move into the data directory beneath the user's home directory, so that
## in_file and out_file are resolved relative to it
home_dir = os.path.expanduser('~')
work_dir = os.path.normpath(home_dir + data_dir)
os.chdir(work_dir)

## Pull out lines matching any pattern in keep_lines
## Notes:
##   1) Surrounding whitespace (including the trailing newline) is stripped
##      from every kept line
##   2) A line is kept once even if it matches several patterns
##   3) The lines are gathered in a plain list and converted to a Series in
##      one step: Series.append() was removed in pandas 2.0, and building the
##      Series at once gives every row its own index label instead of
##      repeating label 0
with open(in_file) as handle:
    matched_lines = [
        line.strip()
        for line in handle
        if any(pattern in line for pattern in keep_lines)
    ]
## dtype=object keeps the .str accessor usable even when no line matched
yanked = pd.Series(matched_lines, dtype=object)
                
## Break each kept line at the tab into separate columns, name the two
## resulting columns, then fill in the columns that hold the same value
## for every primer
primer_df = yanked.str.split('\t', expand=True)
primer_df.columns = ['Primer', 'Sequence']
for column_name, constant in (('Scale', '25nm'), ('Purification', 'STD')):
    primer_df[column_name] = constant

## Now, need to find each primer containing the FAM and VIC sequences
## and add "_FAM" and "_VIC" to the end of their names
## Each tag is searched for as a literal substring (regex=False), giving a
## row-wise Boolean mask. The suffix is then appended in place on the masked
## rows only, so the assignment does not rely on re-aligning a full-length
## column by index label (which requires unique labels).
## A sequence containing both tags ends up with "_FAM_VIC".
for tag_seq, suffix in ((fam_seq, '_FAM'), (vic_seq, '_VIC')):
    has_tag = primer_df['Sequence'].str.contains(tag_seq, regex=False)
    primer_df.loc[has_tag, 'Primer'] += suffix

## Save the finished table as a .csv file, leaving out the row index
primer_df.to_csv(path_or_buf=out_file, index=False)

Comments (0)

HTTPS SSH

You can clone a snippet to your computer for local editing. Learn more.