c756026608
* java/text/RuleBasedCollator.java (RuleBasedCollator): Check rules not empty, fixed search in already existing collation elements. (is_special): Removed common whitespace characters. (text_argument): Dont return on whitespaces, add characters between two ' to string buffer. From-SVN: r72716
585 lines
18 KiB
Java
585 lines
18 KiB
Java
/* RuleBasedCollator.java -- Concrete Collator Class
|
|
Copyright (C) 1998, 1999, 2000, 2001, 2003 Free Software Foundation, Inc.
|
|
|
|
This file is part of GNU Classpath.
|
|
|
|
GNU Classpath is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2, or (at your option)
|
|
any later version.
|
|
|
|
GNU Classpath is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with GNU Classpath; see the file COPYING. If not, write to the
|
|
Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
|
02111-1307 USA.
|
|
|
|
Linking this library statically or dynamically with other modules is
|
|
making a combined work based on this library. Thus, the terms and
|
|
conditions of the GNU General Public License cover the whole
|
|
combination.
|
|
|
|
As a special exception, the copyright holders of this library give you
|
|
permission to link this library with independent modules to produce an
|
|
executable, regardless of the license terms of these independent
|
|
modules, and to copy and distribute the resulting executable under
|
|
terms of your choice, provided that you also meet, for each linked
|
|
independent module, the terms and conditions of the license of that
|
|
module. An independent module is a module which is not derived from
|
|
or based on this library. If you modify this library, you may extend
|
|
this exception to your version of the library, but you are not
|
|
obligated to do so. If you do not wish to do so, delete this
|
|
exception statement from your version. */
|
|
|
|
|
|
package java.text;
|
|
|
|
import java.util.Enumeration;
|
|
import java.util.Hashtable;
|
|
import java.util.Vector;
|
|
|
|
/* Written using "Java Class Libraries", 2nd edition, plus online
|
|
* API docs for JDK 1.2 from http://www.javasoft.com.
|
|
* Status: Believed complete and correct
|
|
*/
|
|
|
|
/**
|
|
* This class is a concrete subclass of <code>Collator</code> suitable
|
|
* for string collation in a wide variety of languages. An instance of
|
|
* this class is normally returned by the <code>getInstance</code> method
|
|
* of <code>Collator</code> with rules predefined for the requested
|
|
* locale. However, an instance of this class can be created manually
|
|
* with any desired rules.
|
|
* <p>
|
|
* Rules take the form of a <code>String</code> with the following syntax
|
|
* <ul>
|
|
* <li> Modifier: '@'
|
|
* <li> Relation: '<' | ';' | ',' | '=' : <text>
|
|
* <li> Reset: '&' : <text>
|
|
* </ul>
|
|
* The modifier character indicates that accents sort backward as is the
|
|
* case with French. The relational operators specify how the text
|
|
* argument relates to the previous term. The relation characters have
|
|
* the following meanings:
|
|
* <ul>
|
|
* <li>'<' - The text argument is greater than the prior term at the primary
|
|
* difference level.
|
|
* <li>';' - The text argument is greater than the prior term at the secondary
|
|
* difference level.
|
|
* <li>',' - The text argument is greater than the prior term at the tertiary
|
|
* difference level.
|
|
* <li>'=' - The text argument is equal to the prior term
|
|
* </ul>
|
|
* <p>
|
|
* As for the text argument itself, this is any sequence of Unicode
|
|
* characters not in the following ranges: 0x0009-0x000D, 0x0020-0x002F,
|
|
* 0x003A-0x0040, 0x005B-0x0060, and 0x007B-0x007E. If these characters are
|
|
* desired, they must be enclosed in single quotes. If any whitespace is
|
|
* encountered, it is ignored. (For example, "a b" is equal to "ab").
|
|
* <p>
|
|
* The reset operation inserts the following rule at the point where the
|
|
* text argument to it exists in the previously declared rule string. This
|
|
* makes it easy to add new rules to an existing string by simply including
|
|
* them in a reset sequence at the end. Note that the text argument, or
|
|
* at least the first character of it, must be present somewhere in the
|
|
* previously declared rules in order to be inserted properly. If this
|
|
* is not satisfied, a <code>ParseException</code> will be thrown.
|
|
* <p>
|
|
* This system of configuring <code>RuleBasedCollator</code> is needlessly
|
|
* complex and the people at Taligent who developed it (along with the folks
|
|
* at Sun who accepted it into the Java standard library) deserve a slow
|
|
* and agonizing death.
|
|
* <p>
|
|
* Here are a couple of example of rule strings:
|
|
* <p>
|
|
* "< a < b < c" - This string says that a is greater than b which is
|
|
* greater than c, with all differences being primary differences.
|
|
* <p>
|
|
* "< a,A < b,B < c,C" - This string says that 'A' is greater than 'a' with
|
|
* a tertiary strength comparison. Both 'b' and 'B' are greater than 'a' and
|
|
* 'A' during a primary strength comparison. But 'B' is greater than 'b'
|
|
* under a tertiary strength comparison.
|
|
* <p>
|
|
* "< a < c & a < b " - This sequence is identical in function to the
|
|
* "< a < b < c" rule string above. The '&' reset symbol indicates that
|
|
* the rule "< b" is to be inserted after the text argument "a" in the
|
|
* previous rule string segment.
|
|
* <p>
|
|
* "< a < b & y < z" - This is an error. The character 'y' does not appear
|
|
* anywhere in the previous rule string segment so the rule following the
|
|
* reset rule cannot be inserted.
|
|
* <p>
|
|
* For a description of the various comparison strength types, see the
|
|
* documentation for the <code>Collator</code> class.
|
|
* <p>
|
|
* As an additional complication to this already overly complex rule scheme,
|
|
* if any characters precede the first rule, these characters are considered
|
|
* ignorable. They will be treated as if they did not exist during
|
|
* comparisons. For example, "- < a < b ..." would make '-' an ignorable
|
|
* character such that the strings "high-tech" and "hightech" would
|
|
* be considered identical.
|
|
* <p>
|
|
* A <code>ParseException</code> will be thrown for any of the following
|
|
* conditions:
|
|
* <ul>
|
|
* <li>Unquoted punctuation characters in a text argument.
|
|
* <li>A relational or reset operator not followed by a text argument
|
|
* <li>A reset operator where the text argument is not present in
|
|
* the previous rule string section.
|
|
* </ul>
|
|
*
|
|
* @author Aaron M. Renn <arenn@urbanophile.com>
|
|
* @author Tom Tromey <tromey@cygnus.com>
|
|
* @date March 25, 1999
|
|
*/
|
|
|
|
public class RuleBasedCollator extends Collator
|
|
{
|
|
final class CollationElement
|
|
{
|
|
String key;
|
|
char relation;
|
|
|
|
CollationElement (String key, char relation)
|
|
{
|
|
this.key = key;
|
|
this.relation = relation;
|
|
}
|
|
|
|
} // inner class CollationElement
|
|
|
|
// True if we are using French-style accent ordering.
|
|
private boolean frenchAccents;
|
|
|
|
/**
|
|
* This the the original rule string.
|
|
*/
|
|
private String rules;
|
|
|
|
// This maps strings onto collation values.
|
|
private Hashtable map;
|
|
|
|
// An entry in this hash means that more lookahead is required for
|
|
// the prefix string.
|
|
private Hashtable prefixes;
|
|
|
|
/**
|
|
* This method initializes a new instance of <code>RuleBasedCollator</code>
|
|
* with the specified collation rules. Note that an application normally
|
|
* obtains an instance of <code>RuleBasedCollator</code> by calling the
|
|
* <code>getInstance</code> method of <code>Collator</code>. That method
|
|
* automatically loads the proper set of rules for the desired locale.
|
|
*
|
|
* @param rules The collation rule string.
|
|
*
|
|
* @exception ParseException If the rule string contains syntax errors.
|
|
*/
|
|
public RuleBasedCollator (String rules) throws ParseException
|
|
{
|
|
if (rules.equals (""))
|
|
throw new ParseException ("empty rule set", 0);
|
|
|
|
this.rules = rules;
|
|
this.frenchAccents = false;
|
|
|
|
// We keep each rule in order in a vector. At the end we traverse
|
|
// the vector and compute collation values from it.
|
|
int insertion_index = 0;
|
|
Vector vec = new Vector ();
|
|
|
|
StringBuffer argument = new StringBuffer ();
|
|
|
|
int len = rules.length();
|
|
for (int index = 0; index < len; ++index)
|
|
{
|
|
char c = rules.charAt(index);
|
|
|
|
// Just skip whitespace.
|
|
if (Character.isWhitespace(c))
|
|
continue;
|
|
|
|
// Modifier.
|
|
if (c == '@')
|
|
{
|
|
frenchAccents = true;
|
|
continue;
|
|
}
|
|
|
|
// Check for relation or reset operator.
|
|
if (! (c == '<' || c == ';' || c == ',' || c == '=' || c == '&'))
|
|
throw new ParseException ("invalid character", index);
|
|
|
|
++index;
|
|
while (index < len)
|
|
{
|
|
if (! Character.isWhitespace(rules.charAt(index)))
|
|
break;
|
|
++index;
|
|
}
|
|
if (index == len)
|
|
throw new ParseException ("missing argument", index);
|
|
|
|
int save = index;
|
|
index = text_argument (rules, index, argument);
|
|
if (argument.length() == 0)
|
|
throw new ParseException ("invalid character", save);
|
|
String arg = argument.toString();
|
|
int item_index = -1;
|
|
|
|
for (int j = 0; j < vec.size(); ++j)
|
|
{
|
|
CollationElement e = (CollationElement) vec.elementAt (j);
|
|
|
|
if (arg.equals (e.key))
|
|
{
|
|
item_index = j;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (c != '&')
|
|
{
|
|
// If the argument already appears in the vector, then we
|
|
// must remove it in order to re-order.
|
|
if (item_index != -1)
|
|
{
|
|
vec.removeElementAt(item_index);
|
|
if (insertion_index >= item_index)
|
|
--insertion_index;
|
|
}
|
|
CollationElement r = new CollationElement (arg, c);
|
|
vec.insertElementAt(r, insertion_index);
|
|
++insertion_index;
|
|
}
|
|
else
|
|
{
|
|
// Reset.
|
|
if (item_index == -1)
|
|
throw
|
|
new ParseException ("argument to reset not previously seen",
|
|
save);
|
|
insertion_index = item_index + 1;
|
|
}
|
|
|
|
// Ugly: in this case the resulting INDEX comes from
|
|
// text_argument, which returns the index of the next
|
|
// character we should examine.
|
|
--index;
|
|
}
|
|
|
|
// Now construct a hash table that maps strings onto their
|
|
// collation values.
|
|
int primary = 0;
|
|
int secondary = 0;
|
|
int tertiary = 0;
|
|
this.map = new Hashtable ();
|
|
this.prefixes = new Hashtable ();
|
|
Enumeration e = vec.elements();
|
|
while (e.hasMoreElements())
|
|
{
|
|
CollationElement r = (CollationElement) e.nextElement();
|
|
switch (r.relation)
|
|
{
|
|
case '<':
|
|
++primary;
|
|
secondary = 0;
|
|
tertiary = 0;
|
|
break;
|
|
case ';':
|
|
++secondary;
|
|
tertiary = 0;
|
|
break;
|
|
case ',':
|
|
++tertiary;
|
|
break;
|
|
case '=':
|
|
break;
|
|
}
|
|
// This must match CollationElementIterator.
|
|
map.put(r.key, new Integer (primary << 16
|
|
| secondary << 8 | tertiary));
|
|
|
|
// Make a map of all lookaheads we might need.
|
|
for (int i = r.key.length() - 1; i >= 1; --i)
|
|
prefixes.put(r.key.substring(0, i), Boolean.TRUE);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* This method creates a copy of this object.
|
|
*
|
|
* @return A copy of this object.
|
|
*/
|
|
public Object clone()
|
|
{
|
|
RuleBasedCollator c = (RuleBasedCollator) super.clone ();
|
|
c.map = (Hashtable) map.clone ();
|
|
c.prefixes = (Hashtable) map.clone ();
|
|
return c;
|
|
}
|
|
|
|
// A helper for CollationElementIterator.next().
|
|
int ceiNext (CollationElementIterator cei)
|
|
{
|
|
if (cei.lookahead_set)
|
|
{
|
|
cei.lookahead_set = false;
|
|
return cei.lookahead;
|
|
}
|
|
|
|
int save = cei.index;
|
|
int max = cei.text.length();
|
|
String s = null;
|
|
|
|
// It is possible to have a case where `abc' has a mapping, but
|
|
// neither `ab' nor `abd' do. In this case we must treat `abd' as
|
|
// nothing special.
|
|
boolean found = false;
|
|
|
|
int i;
|
|
for (i = save + 1; i <= max; ++i)
|
|
{
|
|
s = cei.text.substring(save, i);
|
|
if (prefixes.get(s) == null)
|
|
break;
|
|
found = true;
|
|
}
|
|
// Assume s != null.
|
|
|
|
Object obj = map.get(s);
|
|
// The special case.
|
|
while (found && obj == null && s.length() > 1)
|
|
{
|
|
--i;
|
|
s = cei.text.substring(save, i);
|
|
obj = map.get(s);
|
|
}
|
|
|
|
// Update state.
|
|
cei.index = i;
|
|
|
|
if (obj == null)
|
|
{
|
|
// This idea, and the values, come from JDK.
|
|
// assert (s.length() == 1)
|
|
cei.lookahead_set = true;
|
|
cei.lookahead = s.charAt(0) << 8;
|
|
return 0x7fff << 16;
|
|
}
|
|
|
|
return ((Integer) obj).intValue();
|
|
}
|
|
|
|
// A helper for compareTo() that returns the next character that has
|
|
// a nonzero ordering at the indicated strength. This is also used
|
|
// in CollationKey.
|
|
static final int next (CollationElementIterator iter, int strength)
|
|
{
|
|
while (true)
|
|
{
|
|
int os = iter.next();
|
|
if (os == CollationElementIterator.NULLORDER)
|
|
return os;
|
|
int c = 0;
|
|
switch (strength)
|
|
{
|
|
case PRIMARY:
|
|
c = os & ~0xffff;
|
|
break;
|
|
case SECONDARY:
|
|
c = os & ~0x00ff;
|
|
break;
|
|
case TERTIARY:
|
|
case IDENTICAL:
|
|
c = os;
|
|
break;
|
|
}
|
|
if (c != 0)
|
|
return c;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* This method returns an integer which indicates whether the first
|
|
* specified <code>String</code> is less than, greater than, or equal to
|
|
* the second. The value depends not only on the collation rules in
|
|
* effect, but also the strength and decomposition settings of this object.
|
|
*
|
|
* @param s1 The first <code>String</code> to compare.
|
|
* @param s2 A second <code>String</code> to compare to the first.
|
|
*
|
|
* @return A negative integer if s1 < s2, a positive integer
|
|
* if s1 > s2, or 0 if s1 == s2.
|
|
*/
|
|
public int compare (String source, String target)
|
|
{
|
|
CollationElementIterator cs, ct;
|
|
|
|
cs = new CollationElementIterator (source, this);
|
|
ct = new CollationElementIterator (target, this);
|
|
|
|
while (true)
|
|
{
|
|
int os = next (cs, strength);
|
|
int ot = next (ct, strength);
|
|
|
|
if (os == CollationElementIterator.NULLORDER
|
|
&& ot == CollationElementIterator.NULLORDER)
|
|
break;
|
|
else if (os == CollationElementIterator.NULLORDER)
|
|
{
|
|
// Source string is shorter, so return "less than".
|
|
return -1;
|
|
}
|
|
else if (ot == CollationElementIterator.NULLORDER)
|
|
{
|
|
// Target string is shorter, so return "greater than".
|
|
return 1;
|
|
}
|
|
|
|
if (os != ot)
|
|
return os - ot;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* This method tests this object for equality against the specified
|
|
* object. This will be true if and only if the specified object is
|
|
* another reference to this object.
|
|
*
|
|
* @param obj The <code>Object</code> to compare against this object.
|
|
*
|
|
* @return <code>true</code> if the specified object is equal to this object, <code>false</code> otherwise.
|
|
*/
|
|
public boolean equals (Object obj)
|
|
{
|
|
if (! (obj instanceof RuleBasedCollator) || ! super.equals(obj))
|
|
return false;
|
|
RuleBasedCollator rbc = (RuleBasedCollator) obj;
|
|
// FIXME: this is probably wrong. Instead we should compare maps
|
|
// directly.
|
|
return (frenchAccents == rbc.frenchAccents
|
|
&& rules.equals(rbc.rules));
|
|
}
|
|
|
|
/**
|
|
* This method returns an instance for <code>CollationElementIterator</code>
|
|
* for the specified <code>String</code> under the collation rules for this
|
|
* object.
|
|
*
|
|
* @param str The <code>String</code> to return the <code>CollationElementIterator</code> instance for.
|
|
*
|
|
* @return A <code>CollationElementIterator</code> for the specified <code>String</code>.
|
|
*/
|
|
public CollationElementIterator getCollationElementIterator (String source)
|
|
{
|
|
StringBuffer expand = new StringBuffer (source.length());
|
|
int max = source.length();
|
|
for (int i = 0; i < max; ++i)
|
|
decomposeCharacter (source.charAt(i), expand);
|
|
return new CollationElementIterator (expand.toString(), this);
|
|
}
|
|
|
|
/**
|
|
* This method returns an instance of <code>CollationElementIterator</code>
|
|
* for the <code>String</code> represented by the specified
|
|
* <code>CharacterIterator</code>.
|
|
*
|
|
* @param ci The <code>CharacterIterator</code> with the desired <code>String</code>.
|
|
*
|
|
* @return A <code>CollationElementIterator</code> for the specified <code>String</code>.
|
|
*/
|
|
public CollationElementIterator getCollationElementIterator (CharacterIterator source)
|
|
{
|
|
StringBuffer expand = new StringBuffer ();
|
|
for (char c = source.first ();
|
|
c != CharacterIterator.DONE;
|
|
c = source.next ())
|
|
decomposeCharacter (c, expand);
|
|
|
|
return new CollationElementIterator (expand.toString(), this);
|
|
}
|
|
|
|
/**
|
|
* This method returns an instance of <code>CollationKey</code> for the
|
|
* specified <code>String</code>. The object returned will have a
|
|
* more efficient mechanism for its comparison function that could
|
|
* provide speed benefits if multiple comparisons are performed, such
|
|
* as during a sort.
|
|
*
|
|
* @param str The <code>String</code> to create a <code>CollationKey</code> for.
|
|
*
|
|
* @return A <code>CollationKey</code> for the specified <code>String</code>.
|
|
*/
|
|
public CollationKey getCollationKey (String source)
|
|
{
|
|
return new CollationKey (getCollationElementIterator (source), source,
|
|
strength);
|
|
}
|
|
|
|
/**
|
|
* This method returns a <code>String</code> containing the collation rules
|
|
* for this object.
|
|
*
|
|
* @return The collation rules for this object.
|
|
*/
|
|
public String getRules()
|
|
{
|
|
return rules;
|
|
}
|
|
|
|
/**
|
|
* This method returns a hash value for this object.
|
|
*
|
|
* @return A hash value for this object.
|
|
*/
|
|
public int hashCode()
|
|
{
|
|
return (frenchAccents ? 1231 : 1237
|
|
^ rules.hashCode()
|
|
^ map.hashCode()
|
|
^ prefixes.hashCode());
|
|
}
|
|
|
|
private final boolean is_special (char c)
|
|
{
|
|
// Rules from JCL book.
|
|
return ((c >= 0x0021 && c <= 0x002f)
|
|
|| (c >= 0x003a && c <= 0x0040)
|
|
|| (c >= 0x005b && c <= 0x0060)
|
|
|| (c >= 0x007b && c <= 0x007e));
|
|
}
|
|
|
|
private final int text_argument (String rules, int index,
|
|
StringBuffer result)
|
|
{
|
|
result.setLength(0);
|
|
int len = rules.length();
|
|
while (index < len)
|
|
{
|
|
char c = rules.charAt (index);
|
|
if (c == '\''
|
|
&& index + 2 < len
|
|
&& rules.charAt (index + 2) == '\'')
|
|
{
|
|
result.append (rules.charAt (index + 1));
|
|
index += 2;
|
|
}
|
|
else if (is_special (c))
|
|
return index;
|
|
else if (!Character.isWhitespace (c))
|
|
result.append (c);
|
|
|
|
++index;
|
|
}
|
|
return index;
|
|
}
|
|
|
|
}
|