Auto merge of #78833 - CDirkx:parse_prefix, r=dtolnay

Refactor and fix `parse_prefix` on Windows

This PR is an extension of #78692 as well as a general refactor of `parse_prefix`:

**Fixes**:
There are two errors in the current implementation of `parse_prefix`:

Firstly, in the current implementation only `\` is recognized as a separator character in device namespace prefixes. This behavior is only correct for verbatim paths; `"\\.\C:/foo"` should be parsed as `"C:"` instead of `"C:/foo"`.

Secondly, the current implementation only handles single separator characters. In non-verbatim paths a series of separator characters should be recognized as a single boundary, e.g. the UNC path `"\\localhost\\\\\\C$\foo"` should be parsed as `"\\localhost\\\\\\C$"` and then `UNC(server: "localhost", share: "C$")`, but currently it is not parsed at all, because it starts being parsed as `\\localhost\` and then has an invalid empty share location.

Paths like `"\\.\C:/foo"` and `"\\localhost\\\\\\C$\foo"` are valid on Windows, they are equivalent to just `"C:\foo"`.

**Refactoring**:
All uses of `&[u8]` within `parse_prefix` are extracted to helper functions and`&OsStr` is used instead. This reduces the number of places unsafe is used:
- `get_first_two_components` is adapted to the more general `parse_next_component` and used in more places
- code for parsing drive prefixes is extracted to `parse_drive`
This commit is contained in:
bors 2020-12-16 00:47:50 +00:00
commit c00a4648a4
4 changed files with 140 additions and 83 deletions

View File

@ -667,10 +667,10 @@ impl OsStr {
/// Gets the underlying byte representation.
///
/// Note: it is *crucial* that this API is private, to avoid
/// Note: it is *crucial* that this API is not externally public, to avoid
/// revealing the internal, platform-specific encodings.
#[inline]
fn bytes(&self) -> &[u8] {
pub(crate) fn bytes(&self) -> &[u8] {
unsafe { &*(&self.inner as *const _ as *const [u8]) }
}

View File

@ -873,12 +873,12 @@ pub fn test_decompositions_windows() {
);
t!("\\\\.\\foo/bar",
iter: ["\\\\.\\foo/bar", "\\"],
iter: ["\\\\.\\foo", "\\", "bar"],
has_root: true,
is_absolute: true,
parent: None,
file_name: None,
file_stem: None,
parent: Some("\\\\.\\foo/"),
file_name: Some("bar"),
file_stem: Some("bar"),
extension: None
);

View File

@ -8,15 +8,12 @@ mod tests;
pub const MAIN_SEP_STR: &str = "\\";
pub const MAIN_SEP: char = '\\';
// The unsafety here stems from converting between `&OsStr` and `&[u8]`
// and back. This is safe to do because (1) we only look at ASCII
// contents of the encoding and (2) new &OsStr values are produced
// only from ASCII-bounded slices of existing &OsStr values.
fn os_str_as_u8_slice(s: &OsStr) -> &[u8] {
unsafe { mem::transmute(s) }
}
unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr {
mem::transmute(s)
// Safety: `bytes` must be a valid wtf8 encoded slice
#[inline]
unsafe fn bytes_as_os_str(bytes: &[u8]) -> &OsStr {
// &OsStr is layout compatible with &Slice, which is compatible with &Wtf8,
// which is compatible with &[u8].
mem::transmute(bytes)
}
#[inline]
@ -29,79 +26,116 @@ pub fn is_verbatim_sep(b: u8) -> bool {
b == b'\\'
}
// In most DOS systems, it is not possible to have more than 26 drive letters.
// See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
pub fn is_valid_drive_letter(disk: u8) -> bool {
disk.is_ascii_alphabetic()
}
pub fn parse_prefix(path: &OsStr) -> Option<Prefix<'_>> {
use Prefix::{DeviceNS, Disk, Verbatim, VerbatimDisk, VerbatimUNC, UNC};
let path = os_str_as_u8_slice(path);
if let Some(path) = strip_prefix(path, r"\\") {
// \\
if let Some(path) = strip_prefix(path, r"?\") {
// \\?\
if let Some(path) = strip_prefix(path, r"UNC\") {
// \\?\UNC\server\share
// \\
if let Some(path) = path.strip_prefix(br"\\") {
// \\?\
if let Some(path) = path.strip_prefix(br"?\") {
// \\?\UNC\server\share
if let Some(path) = path.strip_prefix(br"UNC\") {
let (server, share) = match get_first_two_components(path, is_verbatim_sep) {
Some((server, share)) => unsafe {
(u8_slice_as_os_str(server), u8_slice_as_os_str(share))
},
None => (unsafe { u8_slice_as_os_str(path) }, OsStr::new("")),
};
return Some(VerbatimUNC(server, share));
let (server, path) = parse_next_component(path, true);
let (share, _) = parse_next_component(path, true);
Some(VerbatimUNC(server, share))
} else {
// \\?\path
match path {
// \\?\C:\path
[c, b':', b'\\', ..] if is_valid_drive_letter(*c) => {
return Some(VerbatimDisk(c.to_ascii_uppercase()));
}
// \\?\cat_pics
_ => {
let idx = path.iter().position(|&b| b == b'\\').unwrap_or(path.len());
let slice = &path[..idx];
return Some(Verbatim(unsafe { u8_slice_as_os_str(slice) }));
}
let (prefix, _) = parse_next_component(path, true);
// in verbatim paths only recognize an exact drive prefix
if let Some(drive) = parse_drive_exact(prefix) {
// \\?\C:
Some(VerbatimDisk(drive))
} else {
// \\?\prefix
Some(Verbatim(prefix))
}
}
} else if let Some(path) = path.strip_prefix(b".\\") {
} else if let Some(path) = strip_prefix(path, r".\") {
// \\.\COM42
let idx = path.iter().position(|&b| b == b'\\').unwrap_or(path.len());
let slice = &path[..idx];
return Some(DeviceNS(unsafe { u8_slice_as_os_str(slice) }));
}
match get_first_two_components(path, is_sep_byte) {
Some((server, share)) if !server.is_empty() && !share.is_empty() => {
let (prefix, _) = parse_next_component(path, false);
Some(DeviceNS(prefix))
} else {
let (server, path) = parse_next_component(path, false);
let (share, _) = parse_next_component(path, false);
if !server.is_empty() && !share.is_empty() {
// \\server\share
return Some(unsafe { UNC(u8_slice_as_os_str(server), u8_slice_as_os_str(share)) });
Some(UNC(server, share))
} else {
// no valid prefix beginning with "\\" recognized
None
}
_ => {}
}
} else if let [c, b':', ..] = path {
} else if let Some(drive) = parse_drive(path) {
// C:
if is_valid_drive_letter(*c) {
return Some(Disk(c.to_ascii_uppercase()));
}
Some(Disk(drive))
} else {
// no prefix
None
}
None
}
/// Returns the first two path components with predicate `f`.
///
/// The two components returned will be use by caller
/// to construct `VerbatimUNC` or `UNC` Windows path prefix.
///
/// Returns [`None`] if there are no separators in path.
fn get_first_two_components(path: &[u8], f: fn(u8) -> bool) -> Option<(&[u8], &[u8])> {
let idx = path.iter().position(|&x| f(x))?;
// Panic safe
// The max `idx+1` is `path.len()` and `path[path.len()..]` is a valid index.
let (first, path) = (&path[..idx], &path[idx + 1..]);
let idx = path.iter().position(|&x| f(x)).unwrap_or(path.len());
let second = &path[..idx];
Some((first, second))
// Parses a drive prefix, e.g. "C:" and "C:\whatever"
fn parse_drive(prefix: &OsStr) -> Option<u8> {
// In most DOS systems, it is not possible to have more than 26 drive letters.
// See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
fn is_valid_drive_letter(drive: &u8) -> bool {
drive.is_ascii_alphabetic()
}
match prefix.bytes() {
[drive, b':', ..] if is_valid_drive_letter(drive) => Some(drive.to_ascii_uppercase()),
_ => None,
}
}
// Parses a drive prefix exactly, e.g. "C:"
fn parse_drive_exact(prefix: &OsStr) -> Option<u8> {
// only parse two bytes: the drive letter and the drive separator
if prefix.len() == 2 { parse_drive(prefix) } else { None }
}
fn strip_prefix<'a>(path: &'a OsStr, prefix: &str) -> Option<&'a OsStr> {
// `path` and `prefix` are valid wtf8 and utf8 encoded slices respectively, `path[prefix.len()]`
// is thus a code point boundary and `path[prefix.len()..]` is a valid wtf8 encoded slice.
match path.bytes().strip_prefix(prefix.as_bytes()) {
Some(path) => unsafe { Some(bytes_as_os_str(path)) },
None => None,
}
}
// Parse the next path component.
//
// Returns the next component and the rest of the path excluding the component and separator.
// Does not recognize `/` as a separator character if `verbatim` is true.
fn parse_next_component(path: &OsStr, verbatim: bool) -> (&OsStr, &OsStr) {
let separator = if verbatim { is_verbatim_sep } else { is_sep_byte };
match path.bytes().iter().position(|&x| separator(x)) {
Some(separator_start) => {
let mut separator_end = separator_start + 1;
// a series of multiple separator characters is treated as a single separator,
// except in verbatim paths
while !verbatim && separator_end < path.len() && separator(path.bytes()[separator_end])
{
separator_end += 1;
}
let component = &path.bytes()[..separator_start];
// Panic safe
// The max `separator_end` is `bytes.len()` and `bytes[bytes.len()..]` is a valid index.
let path = &path.bytes()[separator_end..];
// Safety: `path` is a valid wtf8 encoded slice and each of the separators ('/', '\')
// is encoded in a single byte, therefore `bytes[separator_start]` and
// `bytes[separator_end]` must be code point boundaries and thus
// `bytes[..separator_start]` and `bytes[separator_end..]` are valid wtf8 slices.
unsafe { (bytes_as_os_str(component), bytes_as_os_str(path)) }
}
None => (path, OsStr::new("")),
}
}

View File

@ -1,21 +1,44 @@
use super::*;
#[test]
fn test_get_first_two_components() {
fn test_parse_next_component() {
assert_eq!(
get_first_two_components(br"server\share", is_verbatim_sep),
Some((&b"server"[..], &b"share"[..])),
parse_next_component(OsStr::new(r"server\share"), true),
(OsStr::new(r"server"), OsStr::new(r"share"))
);
assert_eq!(
get_first_two_components(br"server\", is_verbatim_sep),
Some((&b"server"[..], &b""[..]))
parse_next_component(OsStr::new(r"server/share"), true),
(OsStr::new(r"server/share"), OsStr::new(r""))
);
assert_eq!(
get_first_two_components(br"\server\", is_verbatim_sep),
Some((&b""[..], &b"server"[..]))
parse_next_component(OsStr::new(r"server/share"), false),
(OsStr::new(r"server"), OsStr::new(r"share"))
);
assert_eq!(get_first_two_components(br"there are no separators here", is_verbatim_sep), None,);
assert_eq!(
parse_next_component(OsStr::new(r"server\"), false),
(OsStr::new(r"server"), OsStr::new(r""))
);
assert_eq!(
parse_next_component(OsStr::new(r"\server\"), false),
(OsStr::new(r""), OsStr::new(r"server\"))
);
assert_eq!(
parse_next_component(OsStr::new(r"servershare"), false),
(OsStr::new(r"servershare"), OsStr::new(""))
);
assert_eq!(
parse_next_component(OsStr::new(r"server/\//\/\\\\/////\/share"), false),
(OsStr::new(r"server"), OsStr::new(r"share"))
);
assert_eq!(
parse_next_component(OsStr::new(r"server\\\\\\\\\\\\\\share"), true),
(OsStr::new(r"server"), OsStr::new(r"\\\\\\\\\\\\\share"))
);
}