Auto merge of #78833 - CDirkx:parse_prefix, r=dtolnay
Refactor and fix `parse_prefix` on Windows. This PR is an extension of #78692 as well as a general refactor of `parse_prefix`: **Fixes**: There are two errors in the current implementation of `parse_prefix`: Firstly, in the current implementation only `\` is recognized as a separator character in device namespace prefixes. This behavior is only correct for verbatim paths; `"\\.\C:/foo"` should be parsed as `"C:"` instead of `"C:/foo"`. Secondly, the current implementation only handles single separator characters. In non-verbatim paths a series of separator characters should be recognized as a single boundary, e.g. the UNC path `"\\localhost\\\\\\C$\foo"` should be parsed as `"\\localhost\\\\\\C$"` and then `UNC(server: "localhost", share: "C$")`, but currently it is not parsed at all, because it starts being parsed as `\\localhost\` and then has an invalid empty share location. Paths like `"\\.\C:/foo"` and `"\\localhost\\\\\\C$\foo"` are valid on Windows; they are equivalent to just `"C:\foo"`. **Refactoring**: All uses of `&[u8]` within `parse_prefix` are extracted to helper functions and `&OsStr` is used instead. This reduces the number of places `unsafe` is used: - `get_first_two_components` is adapted to the more general `parse_next_component` and used in more places - code for parsing drive prefixes is extracted to `parse_drive`
This commit is contained in:
commit
c00a4648a4
@ -667,10 +667,10 @@ impl OsStr {
|
||||
|
||||
/// Gets the underlying byte representation.
|
||||
///
|
||||
/// Note: it is *crucial* that this API is private, to avoid
|
||||
/// Note: it is *crucial* that this API is not externally public, to avoid
|
||||
/// revealing the internal, platform-specific encodings.
|
||||
#[inline]
|
||||
fn bytes(&self) -> &[u8] {
|
||||
pub(crate) fn bytes(&self) -> &[u8] {
|
||||
unsafe { &*(&self.inner as *const _ as *const [u8]) }
|
||||
}
|
||||
|
||||
|
@ -873,12 +873,12 @@ pub fn test_decompositions_windows() {
|
||||
);
|
||||
|
||||
t!("\\\\.\\foo/bar",
|
||||
iter: ["\\\\.\\foo/bar", "\\"],
|
||||
iter: ["\\\\.\\foo", "\\", "bar"],
|
||||
has_root: true,
|
||||
is_absolute: true,
|
||||
parent: None,
|
||||
file_name: None,
|
||||
file_stem: None,
|
||||
parent: Some("\\\\.\\foo/"),
|
||||
file_name: Some("bar"),
|
||||
file_stem: Some("bar"),
|
||||
extension: None
|
||||
);
|
||||
|
||||
|
@ -8,15 +8,12 @@ mod tests;
|
||||
pub const MAIN_SEP_STR: &str = "\\";
|
||||
pub const MAIN_SEP: char = '\\';
|
||||
|
||||
// The unsafety here stems from converting between `&OsStr` and `&[u8]`
|
||||
// and back. This is safe to do because (1) we only look at ASCII
|
||||
// contents of the encoding and (2) new &OsStr values are produced
|
||||
// only from ASCII-bounded slices of existing &OsStr values.
|
||||
fn os_str_as_u8_slice(s: &OsStr) -> &[u8] {
|
||||
unsafe { mem::transmute(s) }
|
||||
}
|
||||
unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr {
|
||||
mem::transmute(s)
|
||||
// Safety: `bytes` must be a valid wtf8 encoded slice
|
||||
#[inline]
|
||||
unsafe fn bytes_as_os_str(bytes: &[u8]) -> &OsStr {
|
||||
// &OsStr is layout compatible with &Slice, which is compatible with &Wtf8,
|
||||
// which is compatible with &[u8].
|
||||
mem::transmute(bytes)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@ -29,79 +26,116 @@ pub fn is_verbatim_sep(b: u8) -> bool {
|
||||
b == b'\\'
|
||||
}
|
||||
|
||||
/// Returns `true` if `disk` is an ASCII letter (`A`-`Z` or `a`-`z`).
///
/// In most DOS systems, it is not possible to have more than 26 drive letters.
/// See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
pub fn is_valid_drive_letter(disk: u8) -> bool {
    disk.is_ascii_alphabetic()
}
|
||||
|
||||
pub fn parse_prefix(path: &OsStr) -> Option<Prefix<'_>> {
|
||||
use Prefix::{DeviceNS, Disk, Verbatim, VerbatimDisk, VerbatimUNC, UNC};
|
||||
|
||||
let path = os_str_as_u8_slice(path);
|
||||
if let Some(path) = strip_prefix(path, r"\\") {
|
||||
// \\
|
||||
if let Some(path) = strip_prefix(path, r"?\") {
|
||||
// \\?\
|
||||
if let Some(path) = strip_prefix(path, r"UNC\") {
|
||||
// \\?\UNC\server\share
|
||||
|
||||
// \\
|
||||
if let Some(path) = path.strip_prefix(br"\\") {
|
||||
// \\?\
|
||||
if let Some(path) = path.strip_prefix(br"?\") {
|
||||
// \\?\UNC\server\share
|
||||
if let Some(path) = path.strip_prefix(br"UNC\") {
|
||||
let (server, share) = match get_first_two_components(path, is_verbatim_sep) {
|
||||
Some((server, share)) => unsafe {
|
||||
(u8_slice_as_os_str(server), u8_slice_as_os_str(share))
|
||||
},
|
||||
None => (unsafe { u8_slice_as_os_str(path) }, OsStr::new("")),
|
||||
};
|
||||
return Some(VerbatimUNC(server, share));
|
||||
let (server, path) = parse_next_component(path, true);
|
||||
let (share, _) = parse_next_component(path, true);
|
||||
|
||||
Some(VerbatimUNC(server, share))
|
||||
} else {
|
||||
// \\?\path
|
||||
match path {
|
||||
// \\?\C:\path
|
||||
[c, b':', b'\\', ..] if is_valid_drive_letter(*c) => {
|
||||
return Some(VerbatimDisk(c.to_ascii_uppercase()));
|
||||
}
|
||||
// \\?\cat_pics
|
||||
_ => {
|
||||
let idx = path.iter().position(|&b| b == b'\\').unwrap_or(path.len());
|
||||
let slice = &path[..idx];
|
||||
return Some(Verbatim(unsafe { u8_slice_as_os_str(slice) }));
|
||||
}
|
||||
let (prefix, _) = parse_next_component(path, true);
|
||||
|
||||
// in verbatim paths only recognize an exact drive prefix
|
||||
if let Some(drive) = parse_drive_exact(prefix) {
|
||||
// \\?\C:
|
||||
Some(VerbatimDisk(drive))
|
||||
} else {
|
||||
// \\?\prefix
|
||||
Some(Verbatim(prefix))
|
||||
}
|
||||
}
|
||||
} else if let Some(path) = path.strip_prefix(b".\\") {
|
||||
} else if let Some(path) = strip_prefix(path, r".\") {
|
||||
// \\.\COM42
|
||||
let idx = path.iter().position(|&b| b == b'\\').unwrap_or(path.len());
|
||||
let slice = &path[..idx];
|
||||
return Some(DeviceNS(unsafe { u8_slice_as_os_str(slice) }));
|
||||
}
|
||||
match get_first_two_components(path, is_sep_byte) {
|
||||
Some((server, share)) if !server.is_empty() && !share.is_empty() => {
|
||||
let (prefix, _) = parse_next_component(path, false);
|
||||
Some(DeviceNS(prefix))
|
||||
} else {
|
||||
let (server, path) = parse_next_component(path, false);
|
||||
let (share, _) = parse_next_component(path, false);
|
||||
|
||||
if !server.is_empty() && !share.is_empty() {
|
||||
// \\server\share
|
||||
return Some(unsafe { UNC(u8_slice_as_os_str(server), u8_slice_as_os_str(share)) });
|
||||
Some(UNC(server, share))
|
||||
} else {
|
||||
// no valid prefix beginning with "\\" recognized
|
||||
None
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
} else if let [c, b':', ..] = path {
|
||||
} else if let Some(drive) = parse_drive(path) {
|
||||
// C:
|
||||
if is_valid_drive_letter(*c) {
|
||||
return Some(Disk(c.to_ascii_uppercase()));
|
||||
}
|
||||
Some(Disk(drive))
|
||||
} else {
|
||||
// no prefix
|
||||
None
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns the first two path components with predicate `f`.
|
||||
///
|
||||
/// The two components returned will be used by the caller
|
||||
/// to construct `VerbatimUNC` or `UNC` Windows path prefix.
|
||||
///
|
||||
/// Returns [`None`] if there are no separators in path.
|
||||
fn get_first_two_components(path: &[u8], f: fn(u8) -> bool) -> Option<(&[u8], &[u8])> {
|
||||
let idx = path.iter().position(|&x| f(x))?;
|
||||
// Panic safe
|
||||
// The max `idx+1` is `path.len()` and `path[path.len()..]` is a valid index.
|
||||
let (first, path) = (&path[..idx], &path[idx + 1..]);
|
||||
let idx = path.iter().position(|&x| f(x)).unwrap_or(path.len());
|
||||
let second = &path[..idx];
|
||||
Some((first, second))
|
||||
// Parses a drive prefix, e.g. "C:" and "C:\whatever"
|
||||
fn parse_drive(prefix: &OsStr) -> Option<u8> {
|
||||
// In most DOS systems, it is not possible to have more than 26 drive letters.
|
||||
// See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
|
||||
fn is_valid_drive_letter(drive: &u8) -> bool {
|
||||
drive.is_ascii_alphabetic()
|
||||
}
|
||||
|
||||
match prefix.bytes() {
|
||||
[drive, b':', ..] if is_valid_drive_letter(drive) => Some(drive.to_ascii_uppercase()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
// Parses a drive prefix exactly, e.g. "C:"
|
||||
fn parse_drive_exact(prefix: &OsStr) -> Option<u8> {
|
||||
// only parse two bytes: the drive letter and the drive separator
|
||||
if prefix.len() == 2 { parse_drive(prefix) } else { None }
|
||||
}
|
||||
|
||||
fn strip_prefix<'a>(path: &'a OsStr, prefix: &str) -> Option<&'a OsStr> {
|
||||
// `path` and `prefix` are valid wtf8 and utf8 encoded slices respectively, `path[prefix.len()]`
|
||||
// is thus a code point boundary and `path[prefix.len()..]` is a valid wtf8 encoded slice.
|
||||
match path.bytes().strip_prefix(prefix.as_bytes()) {
|
||||
Some(path) => unsafe { Some(bytes_as_os_str(path)) },
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
// Parse the next path component.
|
||||
//
|
||||
// Returns the next component and the rest of the path excluding the component and separator.
|
||||
// Does not recognize `/` as a separator character if `verbatim` is true.
|
||||
fn parse_next_component(path: &OsStr, verbatim: bool) -> (&OsStr, &OsStr) {
|
||||
let separator = if verbatim { is_verbatim_sep } else { is_sep_byte };
|
||||
|
||||
match path.bytes().iter().position(|&x| separator(x)) {
|
||||
Some(separator_start) => {
|
||||
let mut separator_end = separator_start + 1;
|
||||
|
||||
// a series of multiple separator characters is treated as a single separator,
|
||||
// except in verbatim paths
|
||||
while !verbatim && separator_end < path.len() && separator(path.bytes()[separator_end])
|
||||
{
|
||||
separator_end += 1;
|
||||
}
|
||||
|
||||
let component = &path.bytes()[..separator_start];
|
||||
|
||||
// Panic safe
|
||||
// The max `separator_end` is `bytes.len()` and `bytes[bytes.len()..]` is a valid index.
|
||||
let path = &path.bytes()[separator_end..];
|
||||
|
||||
// Safety: `path` is a valid wtf8 encoded slice and each of the separators ('/', '\')
|
||||
// is encoded in a single byte, therefore `bytes[separator_start]` and
|
||||
// `bytes[separator_end]` must be code point boundaries and thus
|
||||
// `bytes[..separator_start]` and `bytes[separator_end..]` are valid wtf8 slices.
|
||||
unsafe { (bytes_as_os_str(component), bytes_as_os_str(path)) }
|
||||
}
|
||||
None => (path, OsStr::new("")),
|
||||
}
|
||||
}
|
||||
|
@ -1,21 +1,44 @@
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_get_first_two_components() {
|
||||
fn test_parse_next_component() {
|
||||
assert_eq!(
|
||||
get_first_two_components(br"server\share", is_verbatim_sep),
|
||||
Some((&b"server"[..], &b"share"[..])),
|
||||
parse_next_component(OsStr::new(r"server\share"), true),
|
||||
(OsStr::new(r"server"), OsStr::new(r"share"))
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
get_first_two_components(br"server\", is_verbatim_sep),
|
||||
Some((&b"server"[..], &b""[..]))
|
||||
parse_next_component(OsStr::new(r"server/share"), true),
|
||||
(OsStr::new(r"server/share"), OsStr::new(r""))
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
get_first_two_components(br"\server\", is_verbatim_sep),
|
||||
Some((&b""[..], &b"server"[..]))
|
||||
parse_next_component(OsStr::new(r"server/share"), false),
|
||||
(OsStr::new(r"server"), OsStr::new(r"share"))
|
||||
);
|
||||
|
||||
assert_eq!(get_first_two_components(br"there are no separators here", is_verbatim_sep), None,);
|
||||
assert_eq!(
|
||||
parse_next_component(OsStr::new(r"server\"), false),
|
||||
(OsStr::new(r"server"), OsStr::new(r""))
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
parse_next_component(OsStr::new(r"\server\"), false),
|
||||
(OsStr::new(r""), OsStr::new(r"server\"))
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
parse_next_component(OsStr::new(r"servershare"), false),
|
||||
(OsStr::new(r"servershare"), OsStr::new(""))
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
parse_next_component(OsStr::new(r"server/\//\/\\\\/////\/share"), false),
|
||||
(OsStr::new(r"server"), OsStr::new(r"share"))
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
parse_next_component(OsStr::new(r"server\\\\\\\\\\\\\\share"), true),
|
||||
(OsStr::new(r"server"), OsStr::new(r"\\\\\\\\\\\\\share"))
|
||||
);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user