Snippets

Brian Ward py3_standardize_character_vectors

Updated by Brian Ward

File standardize_char_vec.py Modified

  • Ignore whitespace
  • Hide word diff
     ## Convert to uppercase, strip leading/trailing whitespace        
     str_list = [x.upper().strip() for x in str_list]
     
+    ## Replace emdashes with hyphens
+    str_list = [str.replace('--', '-') for x in str_list]
+    
+    ## Remove non-ASCII characters
+    ## HAVEN'T PERSONALLY TESTED THIS YET
+    ## Got solution from https://yuji.wordpress.com/2010/01/28/python-sql-remove-all-non-ascii-characters-from-string/
+    for s in str_list
+        s = ''.join([x for x in s if ord(x) < 128])
+    
     ## Swap out whitespace, dash, period, underscore for selected delimiter
     str_list = [re.sub('\s|\.|-|_', delim, x) for x in str_list]
     return str_list
Created by Brian Ward

File standardize_char_vec.py Added

  • Ignore whitespace
  • Hide word diff
def stand_strings(str_list, delim):
    """Standardize capitalization and word delimiters for a list of strings.

    Each string is upper-cased, stripped of leading/trailing whitespace, and
    then every whitespace character, dash, period, or underscore is replaced
    by ``delim``. Replacement is one-for-one: a run of several separators
    yields the same number of ``delim`` characters (runs are not collapsed).

    Args:
        str_list (list): A list of input strings.
        delim (str): The desired output word delimiter - must be one of
            ' ' (i.e. space), '-', '_', or '.'

    Returns:
        A new list of the input strings, converted to uppercase with word
        delimiters converted to the user's chosen delimiter. The input list
        is not modified.

    Raises:
        SystemExit: If ``delim`` is not one of the allowed delimiters
            (raised via ``sys.exit`` with an explanatory message).

    Depends:
        Modules re and sys.

    Examples:
        >>> import re
        >>> import sys
        >>> ny = [' New York City ', 'New-York_City', 'new.york.city']
        >>> print(stand_strings(ny, "_"))
        ['NEW_YORK_CITY', 'NEW_YORK_CITY', 'NEW_YORK_CITY']

    """

    ## Check for appropriate word delimiter selection
    if delim not in (' ', '-', '_', '.'):
        sys.exit('''Please select one of the following for the output word delimiter:
            " " (space), "-", "_", "." ''')

    ## Any single whitespace character, period, dash, or underscore.
    ## Raw string so that \s reaches the regex engine instead of being
    ## treated as an (invalid) Python string escape.
    separator = re.compile(r'[\s.\-_]')

    ## Convert to uppercase, strip leading/trailing whitespace, then swap
    ## every separator for the selected delimiter.
    return [separator.sub(delim, x.upper().strip()) for x in str_list]