Commits

Anonymous committed 2886403

Lots of changes in order to get the events to occur in just the right spots.

Comments (0)

Files changed (1)

 %%{
 machine opensmiles;
 
-# For the '*' in C*O
-star_atom = '*' >star_atom;
+# These atoms can be encoded in a single character
+single_letter_atom = (
+  '*' $star_atom |
 
-# Atoms in the "organic" subset don't need []s
-aliphatic_organic = (
-  'B' (
-    'r' >aliphatic_Br |
-    '' >aliphatic_B
-  ) |
-  'C' (
-    'l' >aliphatic_Cl |
-    '' >aliphatic_C
-  ) |
-  'N' >aliphatic_N |
-  'O' >aliphatic_O |
-  'S' >aliphatic_S |
-  'P' >aliphatic_P |
-  'F' >aliphatic_F |
-  'I' >aliphatic_I 
+  'N' $aliphatic_N |
+  'O' $aliphatic_O |
+  'S' $aliphatic_S |
+  'P' $aliphatic_P |
+  'F' $aliphatic_F |
+  'I' $aliphatic_I |
+
+  'b' $aromatic_b |
+  'c' $aromatic_c |
+  'n' $aromatic_n |
+  'o' $aromatic_o |
+  's' $aromatic_s |
+  'p' $aromatic_p 
+) $organic_atom $add_atom;
+after_B_atom = (
+  'r' $aliphatic_Br $organic_atom $add_atom
 );
-
-aromatic_organic = (
-  'b' >aromatic_b |
-  'c' >aromatic_c |
-  'n' >aromatic_n |
-  'o' >aromatic_o |
-  's' >aromatic_s |
-  'p' >aromatic_p 
+after_C_atom = (
+  'l' $aliphatic_Cl $organic_atom $add_atom
 );
 
 element_symbols = (
   'A' (
-    'c' >symbol_Ac |
-    'g' >symbol_Ag |
-    'l' >symbol_Al |
-    'm' >symbol_Am |
-    'r' >symbol_Ar |
-    's' >symbol_As |
-    't' >symbol_At |
-    'u' >symbol_Au 
+    'c' %symbol_Ac |
+    'g' %symbol_Ag |
+    'l' %symbol_Al |
+    'm' %symbol_Am |
+    'r' %symbol_Ar |
+    's' %symbol_As |
+    't' %symbol_At |
+    'u' %symbol_Au 
   ) |
 
   'B' (
-    'a' >symbol_Ba |
-    'e' >symbol_Be |
-    'h' >symbol_Bh |
-    'i' >symbol_Bi |
-    'k' >symbol_Bk |
-    'r' >symbol_Br |
-    '' >symbol_B 
+    'a' %symbol_Ba |
+    'e' %symbol_Be |
+    'h' %symbol_Bh |
+    'i' %symbol_Bi |
+    'k' %symbol_Bk |
+    'r' %symbol_Br |
+    '' %symbol_B 
   ) |
 
   'C' (
-    'a' >symbol_Ca |
-    'd' >symbol_Cd |
-    'e' >symbol_Ce |
-    'f' >symbol_Cf |
-    'l' >symbol_Cl |
-    'm' >symbol_Cm |
-    'o' >symbol_Co |
-    'r' >symbol_Cr |
-    's' >symbol_Cs |
-    'u' >symbol_Cu |
-    '' >symbol_C 
+    'a' %symbol_Ca |
+    'd' %symbol_Cd |
+    'e' %symbol_Ce |
+    'f' %symbol_Cf |
+    'l' %symbol_Cl |
+    'm' %symbol_Cm |
+    'o' %symbol_Co |
+    'r' %symbol_Cr |
+    's' %symbol_Cs |
+    'u' %symbol_Cu |
+    '' %symbol_C 
   ) |
 
   'D' (
-    'b' >symbol_Db |
-    's' >symbol_Ds |
-    'y' >symbol_Dy 
+    'b' %symbol_Db |
+    's' %symbol_Ds |
+    'y' %symbol_Dy 
   ) |
 
   'E' (
-    'r' >symbol_Er |
-    's' >symbol_Es |
-    'u' >symbol_Eu 
+    'r' %symbol_Er |
+    's' %symbol_Es |
+    'u' %symbol_Eu 
   ) |
 
   'F' (
-    'e' >symbol_Fe |
-    'm' >symbol_Fm |
-    'r' >symbol_Fr |
-    '' >symbol_F 
+    'e' %symbol_Fe |
+    'm' %symbol_Fm |
+    'r' %symbol_Fr |
+    '' %symbol_F 
   ) |
 
  'G' (
-    'a' >symbol_Ga |
-    'd' >symbol_Gd |
-    'e' >symbol_Ge 
+    'a' %symbol_Ga |
+    'd' %symbol_Gd |
+    'e' %symbol_Ge 
   ) | 
 
   'H' (
-    'e' >symbol_He |
-    'f' >symbol_Hf |
-    'g' >symbol_Hg |
-    'o' >symbol_Ho |
-    's' >symbol_Hs |
-    '' >symbol_H 
+    'e' %symbol_He |
+    'f' %symbol_Hf |
+    'g' %symbol_Hg |
+    'o' %symbol_Ho |
+    's' %symbol_Hs |
+    '' %symbol_H 
   ) |
 
   'I' (
-    'n' >symbol_In |
-    'r' >symbol_Ir |
-    '' >symbol_I 
+    'n' %symbol_In |
+    'r' %symbol_Ir |
+    '' %symbol_I 
   ) |
 
   'K' (
-    'r' >symbol_Kr |
-    '' >symbol_K 
+    'r' %symbol_Kr |
+    '' %symbol_K 
   ) |
 
   'L' (
-    'a' >symbol_La |
-    'i' >symbol_Li |
-    'r' >symbol_Lr |
-    'u' >symbol_Lu 
+    'a' %symbol_La |
+    'i' %symbol_Li |
+    'r' %symbol_Lr |
+    'u' %symbol_Lu 
   ) |
 
   'M' (
-    'd' >symbol_Md |
-    'g' >symbol_Mg |
-    'n' >symbol_Mn |
-    'o' >symbol_Mo |
-    't' >symbol_Mt 
+    'd' %symbol_Md |
+    'g' %symbol_Mg |
+    'n' %symbol_Mn |
+    'o' %symbol_Mo |
+    't' %symbol_Mt 
   ) |
 
   'N' (
-    'a' >symbol_Na |
-    'b' >symbol_Nb |
-    'd' >symbol_Nd |
-    'e' >symbol_Ne |
-    'i' >symbol_Ni |
-    'o' >symbol_No |
-    'p' >symbol_Np |
-    ''  >symbol_N 
+    'a' %symbol_Na |
+    'b' %symbol_Nb |
+    'd' %symbol_Nd |
+    'e' %symbol_Ne |
+    'i' %symbol_Ni |
+    'o' %symbol_No |
+    'p' %symbol_Np |
+    ''  %symbol_N 
   ) |
 
   'O' (
-    's' >symbol_Os |
-    '' >symbol_O 
+    's' %symbol_Os |
+    ''  %symbol_O 
   ) |
 
   'P' (
-    'a' >symbol_Pa |
-    'b' >symbol_Pb |
-    'd' >symbol_Pd |
-    'm' >symbol_Pm |
-    'o' >symbol_Po |
-    'r' >symbol_Pr |
-    't' >symbol_Pt |
-    'u' >symbol_Pu |
-    '' >symbol_P 
+    'a' %symbol_Pa |
+    'b' %symbol_Pb |
+    'd' %symbol_Pd |
+    'm' %symbol_Pm |
+    'o' %symbol_Po |
+    'r' %symbol_Pr |
+    't' %symbol_Pt |
+    'u' %symbol_Pu |
+    ''  %symbol_P 
   ) |
 
   'R' (
-    'a' >symbol_Ra |
-    'b' >symbol_Rb |
-    'e' >symbol_Re |
-    'f' >symbol_Rf |
-    'g' >symbol_Rg |
-    'h' >symbol_Rh |
-    'n' >symbol_Rn |
-    'u' >symbol_Ru 
+    'a' %symbol_Ra |
+    'b' %symbol_Rb |
+    'e' %symbol_Re |
+    'f' %symbol_Rf |
+    'g' %symbol_Rg |
+    'h' %symbol_Rh |
+    'n' %symbol_Rn |
+    'u' %symbol_Ru 
   ) |
 
   'S' (
-    'b' >symbol_Sb |
-    'c' >symbol_Sc |
-    'e' >symbol_Se |
-    'g' >symbol_Sg |
-    'i' >symbol_Si |
-    'm' >symbol_Sm |
-    'n' >symbol_Sn |
-    'r' >symbol_Sr |
-    '' >symbol_S 
+    'b' %symbol_Sb |
+    'c' %symbol_Sc |
+    'e' %symbol_Se |
+    'g' %symbol_Sg |
+    'i' %symbol_Si |
+    'm' %symbol_Sm |
+    'n' %symbol_Sn |
+    'r' %symbol_Sr |
+    ''  %symbol_S 
   ) |
 
   'T' (
-    'a' >symbol_Ta |
-    'b' >symbol_Tb |
-    'c' >symbol_Tc |
-    'e' >symbol_Te |
-    'h' >symbol_Th |
-    'i' >symbol_Ti |
-    'l' >symbol_Tl |
-    'm' >symbol_Tm 
+    'a' %symbol_Ta |
+    'b' %symbol_Tb |
+    'c' %symbol_Tc |
+    'e' %symbol_Te |
+    'h' %symbol_Th |
+    'i' %symbol_Ti |
+    'l' %symbol_Tl |
+    'm' %symbol_Tm 
   ) |
 
-  'U'  >symbol_U |
+  'U'  %symbol_U |
 
-  'V' >symbol_V |
+  'V' %symbol_V |
 
-  'W'  >symbol_W |
-  ('X' 'e' >symbol_Xe ) |
+  'W'  %symbol_W |
+  ('X' 'e' %symbol_Xe ) |
 
   'Y' (
-    'b' >symbol_Yb |
-    '' >symbol_Y 
+    'b' %symbol_Yb |
+    ''  %symbol_Y 
   ) |
 
   'Z' (
-    'n' >symbol_Zn |
-    'r' >symbol_Zr 
+    'n' %symbol_Zn |
+    'r' %symbol_Zr 
   )
 );
 
 aromatic_symbols = (
-  'c' >symbol_c | 
-  'n' >symbol_n | 
-  'o' >symbol_o | 
-  'p' >symbol_p | 
+  'c' %symbol_c | 
+  'n' %symbol_n | 
+  'o' %symbol_o | 
+  'p' %symbol_p | 
   's' (
-    'e' >symbol_se |
-    '' >symbol_s 
+    'e' %symbol_se |
+    ''  %symbol_s 
   ) |
-  ('a' 's' >symbol_as )
+  ('a' 's' %symbol_as )
 );
 
 # This is for the star in '[*]'
-star_symbol = '*' >symbol_star;
+star_symbol = '*' %symbol_star;
 
 symbol = (element_symbols | aromatic_symbols | star_symbol);
 
 isotope = ( digit >first_isotope_digit digit* $next_isotope_digit %end_isotope );
 
-chiral_range_30 = ( digit >first_chiral_digit (digit >second_chiral_digit)?);
+# I want to have better error messages should someone try "31" or "99".
+chiral_range_30 = ( ([12] >first_chiral_digit ([0-9] >second_chiral_digit)?) |
+		    ('3' >first_chiral_digit ('0' >second_chiral_digit)?) |
+		    ([456789] >first_chiral_digit) |
+		    ('0' >first_chiral_digit ([0-9] >second_chiral_digit)? ) );
 
 chiral = (
   '@' $err(chiral_cw) (
     '@' >chiral_ccw |
     'T' (
-      'H' [12] >chiral_TH >err(chiral_TH_error) |
-      'B' chiral_range_30 %chiral_TB %err(chiral_TB_error)
+      'H' >chiral_TH [12] >first_chiral_digit >err(chiral_TH_error) |
+      'B' >chiral_TB chiral_range_30 >first_chiral_digit %err(chiral_TB_error)
     ) |
-    'AL' [12] >chiral_AL >err(chiral_AL) |
-    'SP' [123] >chiral_SP >err(chiral_SP) |
-    'OH' chiral_range_30 >chiral_OH %err(chiral_OH_error) |
+    ('A' 'L' >chiral_AL [12] >first_chiral_digit >err(chiral_AL_error)) |
+    ('S' 'P' >chiral_SP [123] >first_chiral_digit >err(chiral_SP_error)) |
+    ('O' 'H' >chiral_OH chiral_range_30 %err(chiral_OH_error)) |
     '' >chiral_cw
   )
 );
   )
 );
 
-charge = (
-  '+' %err(plus_1_charge) (
-    '' >plus_1_charge |
-    '+' >plus_2_charge |
-    (digit >first_charge_digit digit* $next_charge_digit %end_positive_charge %err(end_positive_charge))
-  ) |
-  '-' %err(minus_1_charge) (
-    '' >minus_1_charge |
-    '-' >minus_2_charge |
-    (digit >first_charge_digit digit* $next_charge_digit %end_negative_charge %err(end_negative_charge))
-  )
-);
 
 class = (
-  ':' (digit >first_class_digit digit* $next_class_digit %end_class %err(end_class))
+  ':' %err(expecting_class_digit_error) digit >first_class_digit
+  digit* $next_class_digit %end_class
 );
+ 
+bracket = '[';
+end_bracket = ']' $bracket_atom $add_atom;
 
-organic_atom = (aliphatic_organic | aromatic_organic | star_atom) %from(organic_atom);
-bracket_atom = '[' isotope? symbol chiral? hcount? charge? class? ']' >bracket_atom;
-
-
-atom = (bracket_atom | organic_atom) %from(add_atom);
 
 bond = (
-  '-' >single_bond |
-  '=' >double_bond |
-  '#' >triple_bond |
-  '$' >quadruple_bond |
-  ':' >aromatic_bond |
-  '/' >forward_bond |
-  '\\' >backward_bond
-   # Does not include: '' >single_or_aromatic_bond
+  '-' $single_bond |
+  '=' $double_bond |
+  '#' $triple_bond |
+  '$' $quadruple_bond |
+  ':' $aromatic_bond |
+  '/' $forward_bond |
+  '\\' $backward_bond
+   # Does not include: '' >implicit_bond
 ) >add_bond;
 
+charge_count = digit >first_charge_digit digit* $next_charge_digit;
 
-dot = '.' >dot_disconnect;
+dot = '.' $dot_disconnect;
 
-open_branch = '(' >open_branch;
-close_branch = ')' >close_branch;
+open_branch = '(' $open_branch;
+close_branch = ')' $close_branch;
 
 closure = (
-  digit >single_digit_closure |
-  '%' %err(incomplete_closure_error) digit %err(incomplete_closure_error) digit >two_digit_closure %err(two_digit_closure)
-) >closure;
+  digit $single_digit_closure $closure |
+  '%' %err(incomplete_closure_error)
+  digit %err(incomplete_closure_error)
+  digit >two_digit_closure %err(two_digit_closure) >closure
+) ;
 
-whitespace_end = (' ' | '\t' | '\n' | '\r');
 action opensmiles_end_at_whitespace {
   fbreak;
 }
 
+whitespace = (' ' | '\t' | '\n' | '\r' | '\0') >end_molecule >opensmiles_end_at_whitespace;
+
 smiles_states = (
     start: (
-      atom -> parsed_atom |
-      whitespace_end >opensmiles_end_at_whitespace
+      (
+       'B' -> parsed_first_B |
+       'C' -> parsed_first_C |
+       single_letter_atom -> parsed_atom |
+       bracket -> parsed_bracket
+      ) >err(expecting_atom_error) |
+      whitespace
     ),
 
+    parsed_first_B: (
+      after_B_atom -> parsed_atom |
+      (
+       'B' -> parsed_first_B |
+       'C' -> parsed_first_C |
+       single_letter_atom -> parsed_atom |
+       closure -> parsed_closure |
+       bracket -> parsed_bracket
+      ) >aliphatic_B >organic_atom >add_atom >implicit_bond | (
+       bond -> parsed_bond |
+       close_branch -> parsed_close_branch |
+       open_branch -> parsed_open_branch |
+       dot -> parsed_dot |
+       whitespace
+      ) >aliphatic_B >organic_atom >add_atom
+    ),
+    parsed_first_C: (
+      after_C_atom -> parsed_atom |
+      (
+       'B' -> parsed_first_B |
+       'C' -> parsed_first_C |
+       single_letter_atom -> parsed_atom |
+       bracket -> parsed_bracket |
+       closure -> parsed_closure
+       ) >aliphatic_C >organic_atom >add_atom >implicit_bond | (
+       bond -> parsed_bond |
+       close_branch -> parsed_close_branch |
+       open_branch -> parsed_open_branch |
+       dot -> parsed_dot |
+       whitespace
+      ) >aliphatic_C >organic_atom >add_atom
+    ),
+
+    parsed_bracket: (
+      isotope -> parsed_isotope |
+      symbol >no_isotope -> parsed_symbol
+    ) >err(expecting_isotope_or_symbol_error),
+
+    parsed_isotope: (
+      symbol -> parsed_symbol
+    ) >err(expecting_symbol_error),
+    
+    parsed_symbol: (
+      chiral -> parsed_chiral |
+      hcount >no_chiral -> parsed_hcount |
+      '+' >no_chiral >no_hcount -> parsed_plus_charge |
+      '-' >no_chiral >no_hcount -> parsed_minus_charge |
+      class >no_chiral >no_hcount >no_charge -> parsed_class |
+      end_bracket >no_chiral >no_hcount >no_charge >no_class -> parsed_atom
+    ) >err(expecting_after_symbol_error),
+
+    parsed_chiral: (
+      hcount -> parsed_hcount |
+      '+' >no_hcount -> parsed_plus_charge |
+      '-' >no_hcount -> parsed_minus_charge |
+      class >no_hcount >no_charge -> parsed_class |
+      end_bracket >no_hcount >no_charge >no_class -> parsed_atom
+    ) >err(expecting_after_chiral_error),
+
+    parsed_hcount: (
+      '+' -> parsed_plus_charge |
+      '-' -> parsed_minus_charge |
+      class >no_charge -> parsed_class |
+      end_bracket >no_charge >no_class -> parsed_atom
+    )  >err(expecting_after_hcount_error),
+
+    parsed_plus_charge: (
+      '+' >plus_2_charge -> parsed_charge |
+      charge_count %end_positive_count -> parsed_charge |
+      class >plus_1_charge -> parsed_class |
+      end_bracket >plus_1_charge -> parsed_atom
+    ) >err(expecting_after_plus_charge_error),
+    
+    parsed_minus_charge: (
+      '-' >minus_2_charge -> parsed_charge |
+      charge_count %end_negative_count -> parsed_charge |
+      class >minus_1_charge -> parsed_class |
+      end_bracket >minus_1_charge -> parsed_atom
+    ) >err(expecting_after_minus_charge_error),
+
+    parsed_charge: (
+      class -> parsed_class |
+      end_bracket >no_class -> parsed_atom
+    ) >err(expecting_after_charge_error),
+
+    parsed_class: (
+      end_bracket -> parsed_atom
+    ) >err(expecting_after_class_error),
+
     # CC, C=C, C(C)C, C(C)C, C.C, C1CCC1
     parsed_atom: (
-      atom >single_or_aromatic_bond >add_bond -> parsed_atom |
+      (
+       'B' -> parsed_first_B |
+       'C' -> parsed_first_C |
+       single_letter_atom -> parsed_atom |
+       bracket -> parsed_bracket |
+       closure -> parsed_closure
+      ) >implicit_bond |
       bond -> parsed_bond |
       close_branch -> parsed_close_branch |
       open_branch -> parsed_open_branch |
       dot -> parsed_dot |
-      closure -> parsed_closure |
-      whitespace_end >opensmiles_end_at_whitespace
+      whitespace
     ),
 
     # C=C, C=1CCC=1
     parsed_bond: (
-      atom -> parsed_atom |
+      (
+       'B' -> parsed_first_B |
+       'C' -> parsed_first_C |
+       single_letter_atom -> parsed_atom |
+       bracket -> parsed_bracket
+      ) |
       closure -> parsed_closure
-    ),
-
+    ) >err(expecting_atom_or_closure_error),
+    
     # C(C)C, C(C)=C, C(C).C, C(C(C))C, C(C)(C)C
     parsed_close_branch: (
-      atom -> parsed_atom |
+      (
+       'B' -> parsed_first_B |
+       'C' -> parsed_first_C |
+       single_letter_atom -> parsed_atom |
+       bracket -> parsed_bracket
+      ) >implicit_bond |
       bond -> parsed_bond |
       close_branch -> parsed_close_branch |
       open_branch -> parsed_open_branch |
       dot -> parsed_dot |
-      whitespace_end >opensmiles_end_at_whitespace
+      whitespace
     ),
 
     # C(C), C(=C); C(.C) is also legal (really!) -- XXX: the dot case is left out below
     # Careful! Don't allow "C(3"
     parsed_open_branch: (
-      atom -> parsed_atom |
-      bond -> parsed_bond
+      (
+       (
+        'B' -> parsed_first_B |
+       	'C' -> parsed_first_C |
+       	single_letter_atom -> parsed_atom |
+       	bracket -> parsed_bracket
+       ) >implicit_bond |
+       bond -> parsed_bond
+      ) >err(expecting_atom_or_bond_error)
     ),
 
-    # C.C  -- allow a dot? as in C..C
+    # C.C  -- should a second dot be allowed, as in C..C? No: that is not in OpenSMILES.
     parsed_dot: (
-      atom -> parsed_atom
+      (
+       'B' -> parsed_first_B |
+       'C' -> parsed_first_C |
+       single_letter_atom -> parsed_atom |
+       bracket -> parsed_bracket
+      )
     ),
 
     # C1CCC1, C1=CCC1, C12CC1C2, C1C(CC)1, C1(CC)CC1, c1ccccc1.[NH4+]
     parsed_closure: (
-      atom -> parsed_atom |
-      bond -> parsed_closure |
-      close_branch -> parsed_close_branch |
-      open_branch -> parsed_open_branch |
-      dot -> parsed_dot |
-      closure -> parsed_closure |
-      whitespace_end >opensmiles_end_at_whitespace
+      (
+       'B' -> parsed_first_B |
+       'C' -> parsed_first_C |
+       single_letter_atom -> parsed_atom |
+       bracket -> parsed_bracket |
+       closure -> parsed_closure
+      ) > implicit_bond |
+      (
+       bond -> parsed_bond |
+       close_branch -> parsed_close_branch |
+       open_branch -> parsed_open_branch |
+       dot -> parsed_dot |
+       whitespace
+      )
     )
   );