URLDecoder.java: Remerge with Classpath

* java/net/URLDecoder.java: Remerge with Classpath * java/net/URLEncoder.java: Merge with Classpath From-SVN: r46098
2001-10-08 21:03:34 +00:00 · 2001-10-08 21:03:34 +00:00 · 0bdf7869bf
commit 0bdf7869bf
parent 63dd08e60d
3 changed files with 233 additions and 71 deletions
--- a/libjava/ChangeLog
+++ b/libjava/ChangeLog
@ -1,3 +1,8 @@
+2001-10-07  Mark Wielaard  <mark@klomp.org>
+
+	* java/net/URLDecoder.java: Remerge with Classpath
+	* java/net/URLEncoder.java: Merge with Classpath
+
 2001-10-08  Tom Tromey  <tromey@redhat.com>

 	Fix for PR libgcj/4481:
--- a/libjava/java/net/URLDecoder.java
+++ b/libjava/java/net/URLDecoder.java
@ -28,56 +28,131 @@ package java.net;

 import java.io.UnsupportedEncodingException;

-/**
-  * This utility class contains one static method that converts a 
+ /**
+  * This utility class contains static methods that convert a
  * string encoded in the x-www-form-urlencoded format to the original
-  * text.  The x-www-form-urlencoded format 
-  * replaces certain disallowed characters with
-  * encoded equivalents.  All upper case and lower case letters in the
-  * US alphabet remain as is, the space character (' ') is replaced with
-  * '+' sign, and all other characters are converted to a "%XX" format
-  * where XX is the hexadecimal representation of that character.  Note
-  * that since unicode characters are 16 bits, and this method encodes only
-  * 8 bits of information, the lower 8 bits of the character are used.
+  * text.  The x-www-form-urlencoded format replaces certain disallowed
+  * characters with encoded equivalents.  All upper case and lower case
+  * letters in the US alphabet remain as is, the space character (' ')
+  * is replaced with '+' sign, and all other characters are converted to a
+  * "%XX" format where XX is the hexadecimal representation of that character
+  * in a given character encoding (default is "UTF-8").
  * <p>
  * This method is very useful for decoding strings sent to CGI scripts
  *
-  * Written using on-line Java Platform 1.2 API Specification.
+  * Written using on-line Java Platform 1.2/1.4 API Specification.
  * Status:  Believed complete and correct.
  *
  * @since 1.2
  *
  * @author Warren Levy <warrenl@cygnus.com>
  * @author Aaron M. Renn (arenn@urbanophile.com) (documentation comments)
-  * @date April 22, 1999.
+  * @author Mark Wielaard (mark@klomp.org)
  */
 public class URLDecoder
 {
-/**
+ /**
  * This method translates the passed in string from x-www-form-urlencoded
-  * format and returns it.
+  * format using the default encoding "UTF-8" to decode the hex encoded
+  * unsafe characters.
  *
-  * @param source The String to convert
+  * @param s the String to convert
  *
-  * @return The converted String
+  * @return the converted String
  */
  public static String decode(String s)
  {
+    String decoded;
+    try
+      {
+        decoded = decode(s, "UTF-8");
+      }
+    catch (UnsupportedEncodingException unsupported)
+      {
+        // UTF-8 is expected to be supported everywhere, so this branch is
+        // not expected to run; if it does, hand the input back unchanged.
+        decoded = s;
+      }
+    return decoded;
+  }
+
+ /**
+  * This method translates the passed in string from x-www-form-urlencoded
+  * format using the given character encoding to decode the hex encoded
+  * unsafe characters.
+  * <p>
+  * This implementation will decode the string even if it contains
+  * unsafe characters (characters that should have been encoded) or if the
+  * two characters following a % do not represent a hex encoded byte.
+  * In those cases the unsafe character or the % character will be added
+  * verbatim to the decoded result.
+  *
+  * @param s the String to convert
+  * @param encoding the character encoding to use to decode the hex encoded
+  *        unsafe characters
+  *
+  * @return the converted String
+  *
+  * @since 1.4
+  */
+  public static String decode(String s, String encoding)
+    throws UnsupportedEncodingException
+  {
+    StringBuffer result = new StringBuffer();
+
+    // First convert all '+' characters to spaces.
    String str = s.replace('+', ' ');
-    String result = "";
+
+    // Then go through the whole string looking for byte encoded characters.
+    // Runs of consecutive %XY triplets are decoded together so that a
+    // multi-byte character reaches the charset decoder in one piece.
    int i;
    int start = 0;
+    byte[] bytes = null;
+    int length = str.length();
    while ((i = str.indexOf('%', start)) >= 0)
      {
-	result = result + str.substring(start, i) +
-		 (char) Integer.parseInt(str.substring(i + 1, i + 3), 16);
-	start = i + 3;
+        // Add all non-encoded characters to the result buffer
+        result.append(str.substring(start, i));
+        start = i;
+
+        // Get all consecutive encoded bytes
+        while ((i + 2 < length) && (str.charAt(i) == '%'))
+          i += 3;
+
+        // Make sure the scratch buffer can hold every triplet of this run
+        if ((bytes == null) || (bytes.length < ((i - start) / 3)))
+          bytes = new byte[((i - start) / 3)];
+
+        int index = 0;
+        while (start < i)
+          {
+            // Check each digit on its own.  Integer.parseInt would also
+            // accept a leading sign, turning a malformed "%-1" into the
+            // byte 0xFF instead of leaving it verbatim.
+            int high = Character.digit(str.charAt(start + 1), 16);
+            int low = Character.digit(str.charAt(start + 2), 16);
+            if (high < 0 || low < 0)
+              break;  // Not a hex encoded byte; stop at this % sign
+            bytes[index] = (byte) ((high << 4) | low);
+            index++;
+            start += 3;
+          }
+
+        // Add the bytes as characters according to the given encoding
+        result.append(new String(bytes, 0, index, encoding));
+
+        // Make sure we skip to just after a % sign
+        // There might not have been enough encoded characters after the %
+        // or the two characters after it were not both hex digits
+        if (start < length && str.charAt(start) == '%')
+          {
+            result.append('%');
+            start++;
+          }
      }

+    // Add any characters left
    if (start < str.length())
-      result = result + str.substring(start);
+      result.append(str.substring(start));

-    return result;
+    return result.toString();
  }
-} // class URLDecoder

+} // class URLDecoder
--- a/libjava/java/net/URLEncoder.java
+++ b/libjava/java/net/URLEncoder.java
@ -1,71 +1,153 @@
-// URLEncoder.java - Provides a method for encoding strings according to
-//		     application/x-www-form-urlencoded MIME type.
+/* URLEncoder.java -- Class to convert strings to a properly encoded URL
+   Copyright (C) 1998, 1999, 2001 Free Software Foundation, Inc.

-/* Copyright (C) 1999  Free Software Foundation
+This file is part of GNU Classpath.

-   This file is part of libgcj.
+GNU Classpath is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+ 
+GNU Classpath is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.

-This software is copyrighted work licensed under the terms of the
-Libgcj License.  Please consult the file "LIBGCJ_LICENSE" for
-details.  */
+You should have received a copy of the GNU General Public License
+along with GNU Classpath; see the file COPYING.  If not, write to the
+Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+02111-1307 USA.
+
+As a special exception, if you link this library with other files to
+produce an executable, this library does not by itself cause the
+resulting executable to be covered by the GNU General Public License.
+This exception does not however invalidate any other reasons why the
+executable file might be covered by the GNU General Public License. */

 package java.net;
+
 import java.io.UnsupportedEncodingException;

 /**
- * @author Warren Levy <warrenl@cygnus.com>
- * @date April 22, 1999.
- */
-
-/**
- * Written using on-line Java Platform 1.2 API Specification, as well
+ * Written using on-line Java Platform 1.2/1.4 API Specification, as well
 * as "The Java Class Libraries", 2nd edition (Addison-Wesley, 1998).
 * Status:  Believed complete and correct.
 */

+ /**
+  * This utility class contains static methods that convert a
+  * string into a fully encoded URL string in x-www-form-urlencoded
+  * format.  This format replaces certain disallowed characters with
+  * encoded equivalents.  All upper case and lower case letters in the
+  * US alphabet remain as is, the space character (' ') is replaced with
+  * '+' sign, and all other characters are converted to a "%XX" format
+  * where XX is the hexadecimal representation of that character in a
+  * certain encoding (by default "UTF-8").
+  * <p>
+  * This class is very useful for encoding strings to be sent to CGI scripts.
+  *
+  * @author Aaron M. Renn (arenn@urbanophile.com)
+  * @author Warren Levy <warrenl@cygnus.com>
+  * @author Mark Wielaard (mark@klomp.org)
+  */
 public class URLEncoder
 {
-  // This method, per the JCL, is conservative in that it encodes
-  // some "allowable" characters as % triplets.
+  /**
+   * This method translates the passed in string into x-www-form-urlencoded
+   * format using the standard "UTF-8" character encoding to hex-encode the
+   * unsafe characters.
+   *
+   * @param s The String to convert
+   *
+   * @return The converted String
+   */
  public static String encode(String s)
  {
-    // Get the bytes in ISO-Latin-1 (i.e. 8859_1) per the JCL.
-    // Even though it is the default in most cases, it's specified here
-    // just in case System.getProperty("file.encoding") is not "8859_1".
-    String result = "";
    try
      {
-	byte[] buf = s.getBytes("8859_1");
-	int start = 0;
-	for (int i = 0; i < buf.length; i++)
-	  // For efficiency, check the byte in order of most likely
-	  // possibility so as to minimize the number of comparisons.
-	  // Hence, exclude all the alphanumeric & allowed special chars first.
-	  if ((buf[i] >= 'a' && buf[i] <= 'z') ||
-	      (buf[i] >= 'A' && buf[i] <= 'Z') ||
-	      (buf[i] >= '0' && buf[i] <= '9') ||
-	      buf[i] == '-' || buf[i] == '_' || buf[i] == '.' || buf[i] == '*')
-	    ; // This is the most likely case so exclude first for efficiency.
-	  else if (buf[i] == ' ')
-	    buf[i] = (byte) '+';  // Replace space char with plus symbol.
-	  else
-	    {
-	      result = result + new String(buf, start, i - start, "8859_1") +
-			"%" + Integer.toHexString(((int) buf[i]) & 0xFF);
-	      start = i + 1;
-	    }
-
-	// Append remainder of allowable chars from the string, if any.
-	if (start < buf.length)
-	  result = result +
-		   new String(buf, start, buf.length - start, "8859_1");
+        return encode(s, "UTF-8");
      }
-    catch (UnsupportedEncodingException ex)
+    catch (UnsupportedEncodingException uee)
      {
-	// This should never happen as "8859_1" is the default encoding.
+        // UTF-8 is expected to always be supported, so this is not expected
+        // to happen; if it does, the input is returned unencoded.
 	return s;
      }
-
-    return result;
  }
-}
+
+  /**
+   * This method translates the passed in string into x-www-form-urlencoded
+   * format using the given character encoding to hex-encode the unsafe
+   * characters.  Every encoded byte is written as a '%' followed by exactly
+   * two lowercase hex digits, so values below 0x10 keep their leading zero.
+   *
+   * @param s The String to convert
+   * @param encoding The encoding to use for unsafe characters
+   *
+   * @return The converted String
+   *
+   * @exception UnsupportedEncodingException if the named encoding is not
+   *            supported
+   *
+   * @since 1.4
+   */
+  public static String encode(String s, String encoding)
+    throws UnsupportedEncodingException
+  {
+    StringBuffer result = new StringBuffer();
+    int length = s.length();
+    int start = 0;
+    int i = 0;
+
+    while (true)
+      {
+        while (i < length && isSafe(s.charAt(i)))
+          i++;
+
+        // Safe characters can just be added
+        result.append(s.substring(start, i));
+
+        // Are we done?
+        if (i >= length)
+          return result.toString();
+        else if (s.charAt(i) == ' ')
+          {
+            result.append('+');  // Replace space char with plus symbol.
+            i++;
+          }
+        else
+          {
+            // Collect the whole run of unsafe characters so that characters
+            // spanning several chars or bytes are converted as one unit
+            start = i;
+            char c;
+            while (i < length && (c = s.charAt(i)) != ' ' && !isSafe(c))
+              i++;
+
+            // Convert them to %XY encoded strings
+            String unsafe = s.substring(start, i);
+            byte bytes[] = unsafe.getBytes(encoding);
+            for (int j = 0; j < bytes.length; j++)
+              {
+                // Emit both nibbles explicitly: Integer.toHexString drops
+                // the leading zero, which would turn 0x0A into "%a" and
+                // break the two-digit %XY format the decoder relies on.
+                int value = bytes[j] & 0xFF;
+                result.append('%');
+                result.append(Character.forDigit(value >> 4, 16));
+                result.append(Character.forDigit(value & 0x0F, 16));
+              }
+          }
+        start = i;
+      }
+  }
+
+  /**
+   * Private static method that returns true if the given char is either
+   * an uppercase or lowercase letter from 'a' till 'z', or a digit from
+   * '0' till '9', or one of the characters '-', '_', '.' or '*'. Such
+   * 'safe' characters don't have to be url encoded.
+   */
+  private static boolean isSafe(char c)
+  {
+    // ASCII letters and digits never need encoding
+    if (c >= 'a' && c <= 'z')
+      return true;
+    if (c >= 'A' && c <= 'Z')
+      return true;
+    if (c >= '0' && c <= '9')
+      return true;
+
+    // Apart from those, only these four punctuation marks pass unencoded
+    switch (c)
+      {
+      case '-':
+      case '_':
+      case '.':
+      case '*':
+        return true;
+      default:
+        return false;
+      }
+  }
+
+  /**
+   * Not instantiable: this class only offers static methods.  Declaring the
+   * constructor private keeps the compiler from generating a public default
+   * constructor.
+   */
+  private URLEncoder()
+  {
+  }
+
+} // class URLEncoder