Commits

Michael Heemskerk committed 908a33d

CRUC-4793: Fix for JVM bug with unicode cmdline arguments

  • Participants
  • Parent commits 122363c

Comments (0)

Files changed (5)

File processutils/src/main/java/com/atlassian/utils/process/ExternalProcess.java

     	return System.getProperty("os.name").toLowerCase().contains("windows");
     }
     
+    /*
+      * This method provides a workaround for a JVM bug on windows (see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4947220). The bug
+      * is that the Sun/Oracle JVM uses the (8 bit) OEM codepage for encoding commandline arguments that are passed to an external process.  Any
+      * characters in a command line argument that can't be represented in the OEM codepage will be replaced by the '?' character, which will probably
+      * cause the command that's being called to fail.
+      * 
+      * A bit of background information is helpful to understand what's going on. Windows uses 2 code pages: the OEM code page and the ANSI code page 
+      * (or 'Windows code page'). The OEM code page is always limited to 8 bit character encodings, whereas some windows code pages are 8 bit and some
+      * are larger. The OEM code page is typically used for console applications, whereas the windows code pages are used for native non-Unicode application
+      * using a GUI on Windows systems.  The system-wide settings can be found in the windows registry:
+      * 
+      * - HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\Nls\CodePage\OEMCP defines which OEM code page is used
+      * - HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\Nls\CodePage\ACP defines which ANSI code page is used
+      * 
+      * More info about the history of OEM vs ANSI code pages: http://blogs.msdn.com/b/michkap/archive/2005/02/08/369197.aspx
+      * 
+      * The workaround is to store the command-line arguments in environment variables and refer to the environment vars on the command line. Windows
+      * cmd will expand the command line arguments without using to the OEM code page, which usually has more correlation with the locale of the 
+      * producer of the hardware (it's derived from the BIOS code page) than with the regional settings configured in Windows. The ANSI code page is derived from
+      * the regional settings.
+      * 
+      * When cmd expands the %JENV_XXX% vars on the command line it uses the ANSI code page instead of the OEM code page (or that is what testing
+      * seems to indicate, can't find any definitive answer in the cmd docos). While this still isn't a true fix to the problem, for most situations this will be sufficient 
+      * as the user typically won't use characters that aren't defined for his locale. But then again, they might..
+     */
     private Process createWinProcess(String[] cmdArray, String[] environment, File workingDir) throws IOException {
-        // workaround for JVM bug on windows. See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4947220 for details
         Map<String, String> newEnv = new HashMap<String, String>();
         if (environment == null) {
             // inherit the environment of the current process
                 }
             }
         }
-        String[] i18n = new String[cmdArray.length + 3];
+        int extraArgs = 2;
+        String[] i18n = new String[cmdArray.length + extraArgs];
         i18n[0] = "cmd";
         i18n[1] = "/C";
-        i18n[2] = "/u";
-        i18n[3] = cmdArray[0];
-        for (int counter = 2; counter < cmdArray.length; counter++) {
+        i18n[extraArgs] = cmdArray[0];
+        for (int counter = 1; counter < cmdArray.length; counter++) {
             String envName = "JENV_" + counter;
-            i18n[counter + 3] = "%" + envName + "%";
+            i18n[counter + extraArgs] = "%" + envName + "%";
             newEnv.put(envName, cmdArray[counter]);
         }
         cmdArray = i18n;

File processutils/src/test/java/com/atlassian/utils/process/CallEcho.java

 import java.util.Properties;
 
 public class CallEcho {
-    /**
+	
+	  static public String byteToHex(byte b) {
+		    // Returns hex String representation of byte b
+		    char hexDigit[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+		        'a', 'b', 'c', 'd', 'e', 'f' };
+		    char[] array = { hexDigit[(b >> 4) & 0x0f], hexDigit[b & 0x0f] };
+		    return new String(array);
+		  }
+	  
+	  static public String charToHex(char c) {
+		    // Returns hex String representation of char c
+		    byte hi = (byte) (c >>> 8);
+		    byte lo = (byte) (c & 0xff);
+		    return byteToHex(hi) + byteToHex(lo);
+		  }
+	  
+	  /**
      * Helper method to print out a string as a list of bytes (integers)
      * @param value
      * @return
         }
         StringBuilder builder = new StringBuilder();
         String sep = "";
-        for (byte b : value.getBytes("UTF-8")) {
-            builder.append(sep).append(b);
+        for (int i = 0 ; i < value.length() ; ++i) {
+            builder.append(sep).append(charToHex(value.charAt(i)));
             sep = " ";
         }
         return builder.toString();
     }
     
-    
     /**
      * @param args
      */
         final String echo = echoString.toString();
         if (!echo.isEmpty()) {
             final String[] result = new String[1];
-            List<String> echoCmd = Arrays.asList("echo", echo);
+            List<String> echoCmd = Arrays.asList("java", "-Dfile.encoding="+System.getProperty("file.encoding"), "com.atlassian.utils.process.Echo", echo);
             // the command line arguments will be converted using the default encoding (file.encoding). The lineoutputhandler needs to use the same encoding
             // to process the results. Otherwise, encoding errors will occur.
-            ExternalProcess process = new ExternalProcessBuilder().command(echoCmd).handlers(new LineOutputHandler(System.getProperty("file.encoding")) {
+            ExternalProcess process = new ExternalProcessBuilder()
+            	.command(echoCmd)
+            	.timeout(250)
+            	.handlers(new LineOutputHandler(System.getProperty("file.encoding")) {
                 @Override
                 protected void processLine(int lineNum, String line) {
                     result[0] = line;
                 }
-            }).build();
-            process.setTimeout(250);
+            }, new LineOutputHandler() {
+				@Override
+				protected void processLine(int lineNum, String line) {
+					System.err.println(line);
+				}
+			}).build();
+            process.setEnvironment(new String[] {"CLASSPATH=" + System.getProperty("java.class.path")});
             process.execute();
             
             // output the expected and actual strings as a list of bytes to prevent encodings to mess up the results
-            System.out.println(System.getProperty("file.encoding"));
             System.out.println(getBytes(echo));
             System.out.println(getBytes(result[0]));
         }

File processutils/src/test/java/com/atlassian/utils/process/Echo.java

+package com.atlassian.utils.process;
+
+public class Echo {
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+		String sep = "";
+		for (String str : args) {
+			System.out.println(sep + str);
+			sep = " ";
+		}
+	}
+
+}

File processutils/src/test/java/com/atlassian/utils/process/ExternalProcessTest.java

     private static class EchoResult {
         public String input;
         public String output;
-        public String reportedEncoding;
     }
     
     /*
         try {
             reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
             EchoResult result = new EchoResult();
-            result.reportedEncoding = reader.readLine();
             result.input = reader.readLine();
             result.output = reader.readLine();
             
         assertFalse(message, (value1 == null && value2 == null) || (value1 != null && value1.equals(value2)));
     }
     
+    protected boolean isWindows() {
+    	return System.getProperty("os.name").toLowerCase().contains("windows");
+    }
+    
     @Test
     public void testIncompatibleCommandLineArgumentsASCII() throws Exception {
         EchoResult result = spawnEcho("ASCII", "iso-8859-1");
-        assertNotEquals("ACII - incompatibility expected", result.input, result.output);
+        assertNotEquals("ASCII - incompatibility expected", result.input, result.output);
 
         result = spawnEcho("ASCII", "unicode-chinese");
-        assertNotEquals("ACII - incompatibility expected", result.input, result.output);
+        assertNotEquals("ASCII - incompatibility expected", result.input, result.output);
 }
 
+    /**
+     * This test will only succeed on windows if the ANSI code page that is defined is compatible with iso-8859-1 (e.g. cp1252)
+     * @see ExternalProcess#createWinProcess for more details
+     */
     @Test
     public void testCommandLineArgumentsUTF8() throws Exception {
-        EchoResult result = spawnEcho("UTF-8", "iso-8859-1", "win-1252-not-in-iso-8859-1", "unicode-chinese", "latin-ext-A", "latin-ext-B");
-        assertEquals("UTF-8 - no incompatibility expected", result.input, result.output);
+        EchoResult result = spawnEcho("UTF-8",  "iso-8859-1");
+        assertEquals("UTF-8 - incompatibility detected:", result.input, result.output);
+    }
+    
+    /**
+     * This test is expected to fail on windows (assuming the ANSI CP is cp1252) because on windows that code page
+     * is used to encode the commandline arguments.
+     */
+    @Test
+    public void testCommandLineArgumentsUTF8IncompatibleWithCp1252() throws Exception {
+        EchoResult result = spawnEcho("UTF-8",  "unicode-chinese");
+        
+        if (isWindows()) {
+        	assertNotEquals("UTF-8 - incompatibility expected", result.input, result.output);
+        } else {
+        	assertEquals("UTF-8 - incompatibility detected", result.input, result.output);
+        }
     }
     
     @Test
     public void testCommandLineArgumentsWin1252() throws Exception {
         if (Charset.isSupported("windows-1252")) {
-            EchoResult result = spawnEcho("windows-1252", "iso-8859-1", "win-1252-not-in-iso-8859-1");
-            assertEquals("Windows cp 1252 - 8 - no incompatibility expected", result.input, result.output);
+            EchoResult result = spawnEcho("windows-1252", "iso-8859-1");
+            assertEquals("Windows cp 1252 -  incompatibility detected:", result.input, result.output);
+        }
+    }
+
+    @Test
+    public void testCommandLineArgumentsWin1252Iso88591Incompatibilities() throws Exception {
+        if (Charset.isSupported("windows-1252")) {
+            EchoResult result = spawnEcho("windows-1252", "win-1252-not-in-iso-8859-1");
+            assertEquals("Windows cp 1252 -  incompatibility detected:", result.input, result.output);
+        }
+    }
+
+    @Test
+    public void testCommandLineArgumentsWin437() throws Exception {
+        if (Charset.isSupported("windows-437")) {
+            EchoResult result = spawnEcho("windows-437", "windows-437");
+            assertEquals("Windows cp 437 - incompatibility detected:", result.input, result.output);
         }
     }
     
     @Test
     public void testCommandLineArgumentsIso88591() throws Exception {
-        EchoResult result = spawnEcho("windows-1252", "iso-8859-1");
-        assertEquals("ISO-8859-1 - no incompatibility expected", result.input, result.output);
+        EchoResult result = spawnEcho("iso-8859-1", "iso-8859-1");
+        assertEquals("ISO-8859-1 - incompatibility detected:", result.input, result.output);
     }
 }

File processutils/src/test/java/com/atlassian/utils/process/echostrings.properties

 # see http://www.iana.org/assignments/character-sets for canonical charsets
 
 # collection of iso-8859-1 unicode chars, see http://en.wikipedia.org/wiki/ISO/IEC_8859-1
-iso-8859-1=u00A0\u00A1\u00A2\u00A3\u00A4\u00A5\u00A6\u00A7\u00A8\u00A9\u00AA\u00AB\u00AC\u00AD\u00AE\u00AF\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u00E6
+iso-8859-1=\u00A0\u00A1\u00A2\u00A3\u00A4\u00A5\u00A6\u00A7\u00A8\u00A9\u00AA\u00AB\u00AC\u00AD\u00AE\u00AF\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5\u00E6
 
-# latin-extended
-latin-ext-A=\u0100\u0101\u0102\u0103\u0104\u0105\u0106\u0107\u0108
-latin-ext-B=\u0190\u0191\u0192\u0193\u0194\u0195\u0196\u0197\u0198
+# latin-extended-A
+latin-ext-A=\u0153\u0144\u0133
+latin-ext-B=\u0190\u0191\u0192\u0195
+
+#windows cp 437  (OEM-US) 129 130 131 which map to different unicode code points than UTF-8 or ISO-8859-1 
+win-437=\u00fc\u00f9\u00E2
 
 # collection of windows code page 1252 characters that have no corresponding counterpart in iso-8859-1
 win-1252-not-in-iso-8859-1=\u20AC\u201A\u0192\u0161\u0153\u0178