From 94d73d4403ef98b3b38b8e3035c2eac87fb900f9 Mon Sep 17 00:00:00 2001 From: Christiaan Dirkx Date: Tue, 3 Nov 2020 19:26:31 +0100 Subject: [PATCH] Refactor `parse_prefix` on Windows Refactor `get_first_two_components` to `get_next_component`. Fixes the following behaviour of `parse_prefix`: - series of separator bytes in a prefix are correctly parsed as a single separator - device namespace prefixes correctly recognize both `\\` and `/` as separators --- library/std/src/ffi/os_str.rs | 4 +- library/std/src/path/tests.rs | 8 +- library/std/src/sys/windows/path.rs | 172 +++++++++++++--------- library/std/src/sys/windows/path/tests.rs | 39 ++++- 4 files changed, 140 insertions(+), 83 deletions(-) diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index 7e7a28be2b0..5d93016cadb 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -667,10 +667,10 @@ impl OsStr { /// Gets the underlying byte representation. /// - /// Note: it is *crucial* that this API is private, to avoid + /// Note: it is *crucial* that this API is not externally public, to avoid /// revealing the internal, platform-specific encodings. #[inline] - fn bytes(&self) -> &[u8] { + pub(crate) fn bytes(&self) -> &[u8] { unsafe { &*(&self.inner as *const _ as *const [u8]) } } diff --git a/library/std/src/path/tests.rs b/library/std/src/path/tests.rs index ff94fda5a22..896d6c2a64c 100644 --- a/library/std/src/path/tests.rs +++ b/library/std/src/path/tests.rs @@ -873,12 +873,12 @@ pub fn test_decompositions_windows() { ); t!("\\\\.\\foo/bar", - iter: ["\\\\.\\foo/bar", "\\"], + iter: ["\\\\.\\foo", "\\", "bar"], has_root: true, is_absolute: true, - parent: None, - file_name: None, - file_stem: None, + parent: Some("\\\\.\\foo/"), + file_name: Some("bar"), + file_stem: Some("bar"), extension: None ); diff --git a/library/std/src/sys/windows/path.rs b/library/std/src/sys/windows/path.rs index dda3ed68cfc..c10c0df4a3a 100644 --- a/library/std/src/sys/windows/path.rs +++ b/library/std/src/sys/windows/path.rs @@ -8,15 +8,12 @@ mod tests; pub const MAIN_SEP_STR: &str = "\\"; pub const MAIN_SEP: char = '\\'; -// The unsafety here stems from converting between `&OsStr` and `&[u8]` -// and back. This is safe to do because (1) we only look at ASCII -// contents of the encoding and (2) new &OsStr values are produced -// only from ASCII-bounded slices of existing &OsStr values. -fn os_str_as_u8_slice(s: &OsStr) -> &[u8] { - unsafe { mem::transmute(s) } -} -unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr { - mem::transmute(s) +// Safety: `bytes` must be a valid wtf8 encoded slice +#[inline] +unsafe fn bytes_as_os_str(bytes: &[u8]) -> &OsStr { + // &OsStr is layout compatible with &Slice, which is compatible with &Wtf8, + // which is compatible with &[u8]. + mem::transmute(bytes) } #[inline] @@ -29,79 +26,116 @@ pub fn is_verbatim_sep(b: u8) -> bool { b == b'\\' } -// In most DOS systems, it is not possible to have more than 26 drive letters. -// See . -pub fn is_valid_drive_letter(disk: u8) -> bool { - disk.is_ascii_alphabetic() -} - pub fn parse_prefix(path: &OsStr) -> Option> { use Prefix::{DeviceNS, Disk, Verbatim, VerbatimDisk, VerbatimUNC, UNC}; - let path = os_str_as_u8_slice(path); + if let Some(path) = strip_prefix(path, r"\\") { + // \\ + if let Some(path) = strip_prefix(path, r"?\") { + // \\?\ + if let Some(path) = strip_prefix(path, r"UNC\") { + // \\?\UNC\server\share - // \\ - if let Some(path) = path.strip_prefix(br"\\") { - // \\?\ - if let Some(path) = path.strip_prefix(br"?\") { - // \\?\UNC\server\share - if let Some(path) = path.strip_prefix(br"UNC\") { - let (server, share) = match get_first_two_components(path, is_verbatim_sep) { - Some((server, share)) => unsafe { - (u8_slice_as_os_str(server), u8_slice_as_os_str(share)) - }, - None => (unsafe { u8_slice_as_os_str(path) }, OsStr::new("")), - }; - return Some(VerbatimUNC(server, share)); + let (server, path) = parse_next_component(path, true); + let (share, _) = parse_next_component(path, true); + + Some(VerbatimUNC(server, share)) } else { - // \\?\path - match path { - // \\?\C:\path - [c, b':', b'\\', ..] if is_valid_drive_letter(*c) => { - return Some(VerbatimDisk(c.to_ascii_uppercase())); - } - // \\?\cat_pics - _ => { - let idx = path.iter().position(|&b| b == b'\\').unwrap_or(path.len()); - let slice = &path[..idx]; - return Some(Verbatim(unsafe { u8_slice_as_os_str(slice) })); - } + let (prefix, _) = parse_next_component(path, true); + + // in verbatim paths only recognize an exact drive prefix + if let Some(drive) = parse_drive_exact(prefix) { + // \\?\C: + Some(VerbatimDisk(drive)) + } else { + // \\?\prefix + Some(Verbatim(prefix)) } } - } else if let Some(path) = path.strip_prefix(b".\\") { + } else if let Some(path) = strip_prefix(path, r".\") { // \\.\COM42 - let idx = path.iter().position(|&b| b == b'\\').unwrap_or(path.len()); - let slice = &path[..idx]; - return Some(DeviceNS(unsafe { u8_slice_as_os_str(slice) })); - } - match get_first_two_components(path, is_sep_byte) { - Some((server, share)) if !server.is_empty() && !share.is_empty() => { + let (prefix, _) = parse_next_component(path, false); + Some(DeviceNS(prefix)) + } else { + let (server, path) = parse_next_component(path, false); + let (share, _) = parse_next_component(path, false); + + if !server.is_empty() && !share.is_empty() { // \\server\share - return Some(unsafe { UNC(u8_slice_as_os_str(server), u8_slice_as_os_str(share)) }); + Some(UNC(server, share)) + } else { + // no valid prefix beginning with "\\" recognized + None } - _ => {} } - } else if let [c, b':', ..] = path { + } else if let Some(drive) = parse_drive(path) { // C: - if is_valid_drive_letter(*c) { - return Some(Disk(c.to_ascii_uppercase())); - } + Some(Disk(drive)) + } else { + // no prefix + None } - None } -/// Returns the first two path components with predicate `f`. -/// -/// The two components returned will be use by caller -/// to construct `VerbatimUNC` or `UNC` Windows path prefix. -/// -/// Returns [`None`] if there are no separators in path. -fn get_first_two_components(path: &[u8], f: fn(u8) -> bool) -> Option<(&[u8], &[u8])> { - let idx = path.iter().position(|&x| f(x))?; - // Panic safe - // The max `idx+1` is `path.len()` and `path[path.len()..]` is a valid index. - let (first, path) = (&path[..idx], &path[idx + 1..]); - let idx = path.iter().position(|&x| f(x)).unwrap_or(path.len()); - let second = &path[..idx]; - Some((first, second)) +// Parses a drive prefix, e.g. "C:" and "C:\whatever" +fn parse_drive(prefix: &OsStr) -> Option { + // In most DOS systems, it is not possible to have more than 26 drive letters. + // See . + fn is_valid_drive_letter(drive: &u8) -> bool { + drive.is_ascii_alphabetic() + } + + match prefix.bytes() { + [drive, b':', ..] if is_valid_drive_letter(drive) => Some(drive.to_ascii_uppercase()), + _ => None, + } +} + +// Parses a drive prefix exactly, e.g. "C:" +fn parse_drive_exact(prefix: &OsStr) -> Option { + // only parse two bytes: the drive letter and the drive separator + if prefix.len() == 2 { parse_drive(prefix) } else { None } +} + +fn strip_prefix<'a>(path: &'a OsStr, prefix: &str) -> Option<&'a OsStr> { + // `path` and `prefix` are valid wtf8 and utf8 encoded slices respectively, `path[prefix.len()]` + // is thus a code point boundary and `path[prefix.len()..]` is a valid wtf8 encoded slice. + match path.bytes().strip_prefix(prefix.as_bytes()) { + Some(path) => unsafe { Some(bytes_as_os_str(path)) }, + None => None, + } +} + +// Parse the next path component. +// +// Returns the next component and the rest of the path excluding the component and separator. +// Does not recognize `/` as a separator character if `verbatim` is true. +fn parse_next_component(path: &OsStr, verbatim: bool) -> (&OsStr, &OsStr) { + let separator = if verbatim { is_verbatim_sep } else { is_sep_byte }; + + match path.bytes().iter().position(|&x| separator(x)) { + Some(separator_start) => { + let mut separator_end = separator_start + 1; + + // a series of multiple separator characters is treated as a single separator, + // except in verbatim paths + while !verbatim && separator_end < path.len() && separator(path.bytes()[separator_end]) + { + separator_end += 1; + } + + let component = &path.bytes()[..separator_start]; + + // Panic safe + // The max `separator_end` is `bytes.len()` and `bytes[bytes.len()..]` is a valid index. + let path = &path.bytes()[separator_end..]; + + // Safety: `path` is a valid wtf8 encoded slice and each of the separators ('/', '\') + // is encoded in a single byte, therefore `bytes[separator_start]` and + // `bytes[separator_end]` must be code point boundaries and thus + // `bytes[..separator_start]` and `bytes[separator_end..]` are valid wtf8 slices. + unsafe { (bytes_as_os_str(component), bytes_as_os_str(path)) } + } + None => (path, OsStr::new("")), + } } diff --git a/library/std/src/sys/windows/path/tests.rs b/library/std/src/sys/windows/path/tests.rs index fbac1dc1ca1..9675da6ff88 100644 --- a/library/std/src/sys/windows/path/tests.rs +++ b/library/std/src/sys/windows/path/tests.rs @@ -1,21 +1,44 @@ use super::*; #[test] -fn test_get_first_two_components() { +fn test_parse_next_component() { assert_eq!( - get_first_two_components(br"server\share", is_verbatim_sep), - Some((&b"server"[..], &b"share"[..])), + parse_next_component(OsStr::new(r"server\share"), true), + (OsStr::new(r"server"), OsStr::new(r"share")) ); assert_eq!( - get_first_two_components(br"server\", is_verbatim_sep), - Some((&b"server"[..], &b""[..])) + parse_next_component(OsStr::new(r"server/share"), true), + (OsStr::new(r"server/share"), OsStr::new(r"")) ); assert_eq!( - get_first_two_components(br"\server\", is_verbatim_sep), - Some((&b""[..], &b"server"[..])) + parse_next_component(OsStr::new(r"server/share"), false), + (OsStr::new(r"server"), OsStr::new(r"share")) ); - assert_eq!(get_first_two_components(br"there are no separators here", is_verbatim_sep), None,); + assert_eq!( + parse_next_component(OsStr::new(r"server\"), false), + (OsStr::new(r"server"), OsStr::new(r"")) + ); + + assert_eq!( + parse_next_component(OsStr::new(r"\server\"), false), + (OsStr::new(r""), OsStr::new(r"server\")) + ); + + assert_eq!( + parse_next_component(OsStr::new(r"servershare"), false), + (OsStr::new(r"servershare"), OsStr::new("")) + ); + + assert_eq!( + parse_next_component(OsStr::new(r"server/\//\/\\\\/////\/share"), false), + (OsStr::new(r"server"), OsStr::new(r"share")) + ); + + assert_eq!( + parse_next_component(OsStr::new(r"server\\\\\\\\\\\\\\share"), true), + (OsStr::new(r"server"), OsStr::new(r"\\\\\\\\\\\\\share")) + ); }