From ef08b2339db40b12b7061da1b18a406fd1b94983 Mon Sep 17 00:00:00 2001 From: Gareth Smith Date: Wed, 11 Sep 2013 23:46:33 +0100 Subject: [PATCH] Support character range patterns (e.g. [0-9], [a-z]), like other globs do. --- src/libextra/glob.rs | 140 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 126 insertions(+), 14 deletions(-) diff --git a/src/libextra/glob.rs b/src/libextra/glob.rs index d82c1fd35c2..07386b41caa 100644 --- a/src/libextra/glob.rs +++ b/src/libextra/glob.rs @@ -147,8 +147,14 @@ enum PatternToken { Char(char), AnyChar, AnySequence, - AnyWithin(~[char]), - AnyExcept(~[char]) + AnyWithin(~[CharSpecifier]), + AnyExcept(~[CharSpecifier]) +} + +#[deriving(Clone, Eq, TotalEq, Ord, TotalOrd, IterBytes)] +enum CharSpecifier { + SingleChar(char), + CharRange(char, char) } #[deriving(Eq)] @@ -164,12 +170,15 @@ impl Pattern { * This function compiles Unix shell style patterns: `?` matches any single character, * `*` matches any (possibly empty) sequence of characters and `[...]` matches any character * inside the brackets, unless the first character is `!` in which case it matches any - * character except those between the `!` and the `]`. + * character except those between the `!` and the `]`. Character sequences can also specify + * ranges of characters, as ordered by Unicode, so e.g. `[0-9]` specifies any character + * between 0 and 9 inclusive. * * The metacharacters `?`, `*`, `[`, `]` can be matched by using brackets (e.g. `[?]`). * When a `]` occurs immediately following `[` or `[!` then it is interpreted as * being part of, rather then ending, the character set, so `]` and NOT `]` can be - * matched by `[]]` and `[!]]` respectively. + * matched by `[]]` and `[!]]` respectively. The `-` character can be specified inside a + * character sequence pattern by placing it at the start or the end, e.g. `[abc-]`. * * When a `[` does not have a closing `]` before the end of the string then the `[` will * be treated literally. @@ -199,7 +208,8 @@ impl Pattern { match chars.slice_from(i + 3).position_elem(&']') { None => (), Some(j) => { - tokens.push(AnyExcept(chars.slice(i + 2, i + 3 + j).to_owned())); + let cs = parse_char_specifiers(chars.slice(i + 2, i + 3 + j)); + tokens.push(AnyExcept(cs)); i += j + 4; loop; } @@ -209,7 +219,8 @@ impl Pattern { match chars.slice_from(i + 2).position_elem(&']') { None => (), Some(j) => { - tokens.push(AnyWithin(chars.slice(i + 1, i + 2 + j).to_owned())); + let cs = parse_char_specifiers(chars.slice(i + 1, i + 2 + j)); + tokens.push(AnyWithin(cs)); i += j + 3; loop; } @@ -335,15 +346,11 @@ impl Pattern { AnyChar => { !require_literal(c) } - AnyWithin(ref chars) => { - !require_literal(c) && - chars.iter() - .rposition(|&e| chars_eq(e, c, options.case_sensitive)).is_some() + AnyWithin(ref specifiers) => { + !require_literal(c) && in_char_specifiers(*specifiers, c, options) } - AnyExcept(ref chars) => { - !require_literal(c) && - chars.iter() - .rposition(|&e| chars_eq(e, c, options.case_sensitive)).is_none() + AnyExcept(ref specifiers) => { + !require_literal(c) && !in_char_specifiers(*specifiers, c, options) } Char(c2) => { chars_eq(c, c2, options.case_sensitive) @@ -370,6 +377,63 @@ impl Pattern { } +fn parse_char_specifiers(s: &[char]) -> ~[CharSpecifier] { + let mut cs = ~[]; + let mut i = 0; + while i < s.len() { + if i + 3 <= s.len() && s[i + 1] == '-' { + cs.push(CharRange(s[i], s[i + 2])); + i += 3; + } else { + cs.push(SingleChar(s[i])); + i += 1; + } + } + cs +} + +fn in_char_specifiers(specifiers: &[CharSpecifier], c: char, options: MatchOptions) -> bool { + + for &specifier in specifiers.iter() { + match specifier { + SingleChar(sc) => { + if chars_eq(c, sc, options.case_sensitive) { + return true; + } + } + CharRange(start, end) => { + + // FIXME: work with non-ascii chars properly (issue #1347) + if !options.case_sensitive && c.is_ascii() && start.is_ascii() && end.is_ascii() { + + let start = start.to_ascii().to_lower(); + let end = end.to_ascii().to_lower(); + + let start_up = start.to_upper(); + let end_up = end.to_upper(); + + // only allow case insensitive matching when + // both start and end are within a-z or A-Z + if start != start_up && end != end_up { + let start = start.to_char(); + let end = end.to_char(); + let c = c.to_ascii().to_lower().to_char(); + if c >= start && c <= end { + return true; + } + } + } + + if c >= start && c <= end { + return true; + } + } + } + } + + false +} + /// A helper function to determine if two chars are (possibly case-insensitively) equal. fn chars_eq(a: char, b: char, case_sensitive: bool) -> bool { if cfg!(windows) && path::windows::is_sep(a) && path::windows::is_sep(b) { @@ -672,6 +736,54 @@ mod test { glob("/*/*/*/*").skip(10000).next(); } + #[test] + fn test_range_pattern() { + + let pat = Pattern::new("a[0-9]b"); + for i in range(0, 10) { + assert!(pat.matches(fmt!("a%db", i))); + } + assert!(!pat.matches("a_b")); + + let pat = Pattern::new("a[!0-9]b"); + for i in range(0, 10) { + assert!(!pat.matches(fmt!("a%db", i))); + } + assert!(pat.matches("a_b")); + + let pats = ["[a-z123]", "[1a-z23]", "[123a-z]"]; + for &p in pats.iter() { + let pat = Pattern::new(p); + for c in "abcdefghijklmnopqrstuvwxyz".iter() { + assert!(pat.matches(c.to_str())); + } + for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ".iter() { + let options = MatchOptions {case_sensitive: false, .. MatchOptions::new()}; + assert!(pat.matches_with(c.to_str(), options)); + } + assert!(pat.matches("1")); + assert!(pat.matches("2")); + assert!(pat.matches("3")); + } + + let pats = ["[abc-]", "[-abc]", "[a-c-]"]; + for &p in pats.iter() { + let pat = Pattern::new(p); + assert!(pat.matches("a")); + assert!(pat.matches("b")); + assert!(pat.matches("c")); + assert!(pat.matches("-")); + assert!(!pat.matches("d")); + } + + let pat = Pattern::new("[2-1]"); + assert!(!pat.matches("1")); + assert!(!pat.matches("2")); + + assert!(Pattern::new("[-]").matches("-")); + assert!(!Pattern::new("[!-]").matches("-")); + } + #[test] fn test_unclosed_bracket() { // unclosed `[` should be treated literally