From 3d84b4be3d7691026993d5c733bc26cc637e7c50 Mon Sep 17 00:00:00 2001 From: John Schmidt Date: Sat, 31 May 2014 13:02:29 +0200 Subject: [PATCH] Add `utf16_units` This deprecates `.to_utf16`. `x.to_utf16()` should be replaced by either `x.utf16_units().collect::<Vec<u16>>()` (the type annotation may be optional), or just `x.utf16_units()` directly, if it can be used in an iterator context. Closes #14358 [breaking-change] --- src/libcollections/str.rs | 23 ++++++++--------- src/libcore/str.rs | 46 +++++++++++++++++++++++++++++++++- src/libnative/io/c_win32.rs | 4 ++- src/libnative/io/file_win32.rs | 2 +- src/libnative/io/process.rs | 14 ++++++++--- src/librustdoc/flock.rs | 3 ++- src/libstd/dynamic_lib.rs | 7 ++++-- src/libstd/os.rs | 14 +++++++---- 8 files changed, 85 insertions(+), 28 deletions(-) diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index 72c5aff675b..e07281523f4 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -803,15 +803,9 @@ pub trait StrAllocating: Str { } /// Converts to a vector of `u16` encoded as UTF-16. + #[deprecated = "use `utf16_units` instead"] fn to_utf16(&self) -> Vec { - let me = self.as_slice(); - let mut u = Vec::new(); - for ch in me.chars() { - let mut buf = [0u16, ..2]; - let n = ch.encode_utf16(buf /* as mut slice! */); - u.push_all(buf.slice_to(n)); - } - u + self.as_slice().utf16_units().collect::>() } /// Given a string, make a new string with repeated copies of it. 
@@ -1619,14 +1613,17 @@ mod tests { for p in pairs.iter() { let (s, u) = (*p).clone(); - assert!(is_utf16(u.as_slice())); - assert_eq!(s.to_utf16(), u); + let s_as_utf16 = s.as_slice().utf16_units().collect::>(); + let u_as_string = from_utf16(u.as_slice()).unwrap(); - assert_eq!(from_utf16(u.as_slice()).unwrap(), s); + assert!(is_utf16(u.as_slice())); + assert_eq!(s_as_utf16, u); + + assert_eq!(u_as_string, s); assert_eq!(from_utf16_lossy(u.as_slice()), s); - assert_eq!(from_utf16(s.to_utf16().as_slice()).unwrap(), s); - assert_eq!(from_utf16(u.as_slice()).unwrap().to_utf16(), u); + assert_eq!(from_utf16(s_as_utf16.as_slice()).unwrap(), s); + assert_eq!(u_as_string.as_slice().utf16_units().collect::>(), u); } } diff --git a/src/libcore/str.rs b/src/libcore/str.rs index 0d4b5f59074..e7174944a21 100644 --- a/src/libcore/str.rs +++ b/src/libcore/str.rs @@ -16,6 +16,7 @@ use mem; use char; +use char::Char; use clone::Clone; use cmp; use cmp::{PartialEq, Eq}; @@ -24,7 +25,7 @@ use default::Default; use iter::{Filter, Map, Iterator}; use iter::{DoubleEndedIterator, ExactSize}; use iter::range; -use num::Saturating; +use num::{CheckedMul, Saturating}; use option::{None, Option, Some}; use raw::Repr; use slice::ImmutableVector; @@ -557,6 +558,41 @@ impl<'a> Iterator<&'a str> for StrSplits<'a> { } } +/// External iterator for a string's UTF16 codeunits. +/// Use with the `std::iter` module. +#[deriving(Clone)] +pub struct Utf16CodeUnits<'a> { + chars: Chars<'a>, + extra: u16 +} + +impl<'a> Iterator for Utf16CodeUnits<'a> { + #[inline] + fn next(&mut self) -> Option { + if self.extra != 0 { + let tmp = self.extra; + self.extra = 0; + return Some(tmp); + } + + let mut buf = [0u16, ..2]; + self.chars.next().map(|ch| { + let n = ch.encode_utf16(buf /* as mut slice! 
*/); + if n == 2 { self.extra = buf[1]; } + buf[0] + }) + } + + #[inline] + fn size_hint(&self) -> (uint, Option) { + let (low, high) = self.chars.size_hint(); + // every char gets either one u16 or two u16, + // so this iterator is between 1 or 2 times as + // long as the underlying iterator. + (low, high.and_then(|n| n.checked_mul(&2))) + } +} + /* Section: Comparing strings */ @@ -1619,6 +1655,9 @@ pub trait StrSlice<'a> { /// and that it is not reallocated (e.g. by pushing to the /// string). fn as_ptr(&self) -> *const u8; + + /// Return an iterator of `u16` over the string encoded as UTF-16. + fn utf16_units(&self) -> Utf16CodeUnits<'a>; } impl<'a> StrSlice<'a> for &'a str { @@ -1967,6 +2006,11 @@ impl<'a> StrSlice<'a> for &'a str { fn as_ptr(&self) -> *const u8 { self.repr().data } + + #[inline] + fn utf16_units(&self) -> Utf16CodeUnits<'a> { + Utf16CodeUnits{ chars: self.chars(), extra: 0} + } } impl<'a> Default for &'a str { diff --git a/src/libnative/io/c_win32.rs b/src/libnative/io/c_win32.rs index 7e58102d241..802526c9196 100644 --- a/src/libnative/io/c_win32.rs +++ b/src/libnative/io/c_win32.rs @@ -70,6 +70,7 @@ extern "system" { pub mod compat { use std::intrinsics::{atomic_store_relaxed, transmute}; + use std::iter::Iterator; use libc::types::os::arch::extra::{LPCWSTR, HMODULE, LPCSTR, LPVOID}; extern "system" { @@ -82,7 +83,8 @@ pub mod compat { // layer (after it's loaded) shouldn't be any slower than a regular DLL // call. 
unsafe fn store_func(ptr: *mut uint, module: &str, symbol: &str, fallback: uint) { - let module = module.to_utf16().append_one(0); + let module: Vec = module.utf16_units().collect(); + let module = module.append_one(0); symbol.with_c_str(|symbol| { let handle = GetModuleHandleW(module.as_ptr()); let func: uint = transmute(GetProcAddress(handle, symbol)); diff --git a/src/libnative/io/file_win32.rs b/src/libnative/io/file_win32.rs index 3195fa4f2d4..98553603313 100644 --- a/src/libnative/io/file_win32.rs +++ b/src/libnative/io/file_win32.rs @@ -255,7 +255,7 @@ impl Drop for Inner { pub fn to_utf16(s: &CString) -> IoResult> { match s.as_str() { - Some(s) => Ok(s.to_utf16().append_one(0)), + Some(s) => Ok(s.utf16_units().collect::>().append_one(0)), None => Err(IoError { code: libc::ERROR_INVALID_NAME as uint, extra: 0, diff --git a/src/libnative/io/process.rs b/src/libnative/io/process.rs index 00448b91dbd..3d248e159a2 100644 --- a/src/libnative/io/process.rs +++ b/src/libnative/io/process.rs @@ -294,6 +294,8 @@ fn spawn_process_os(cfg: ProcessConfig, use libc::funcs::extra::msvcrt::get_osfhandle; use std::mem; + use std::iter::Iterator; + use std::str::StrSlice; if cfg.gid.is_some() || cfg.uid.is_some() { return Err(IoError { @@ -328,7 +330,8 @@ fn spawn_process_os(cfg: ProcessConfig, lpSecurityDescriptor: ptr::mut_null(), bInheritHandle: 1, }; - let filename = "NUL".to_utf16().append_one(0); + let filename: Vec = "NUL".utf16_units().collect(); + let filename = filename.append_one(0); *slot = libc::CreateFileW(filename.as_ptr(), access, libc::FILE_SHARE_READ | @@ -371,7 +374,8 @@ fn spawn_process_os(cfg: ProcessConfig, with_envp(cfg.env, |envp| { with_dirp(cfg.cwd, |dirp| { - let mut cmd_str = cmd_str.to_utf16().append_one(0); + let mut cmd_str: Vec = cmd_str.as_slice().utf16_units().collect(); + cmd_str = cmd_str.append_one(0); let created = CreateProcessW(ptr::null(), cmd_str.as_mut_ptr(), ptr::mut_null(), @@ -770,7 +774,7 @@ fn with_envp(env: Option<&[(CString, 
CString)]>, cb: |*mut c_void| -> T) -> T let kv = format!("{}={}", pair.ref0().as_str().unwrap(), pair.ref1().as_str().unwrap()); - blk.push_all(kv.to_utf16().as_slice()); + blk.extend(kv.as_slice().utf16_units()); blk.push(0); } @@ -788,7 +792,9 @@ fn with_dirp(d: Option<&CString>, cb: |*const u16| -> T) -> T { Some(dir) => { let dir_str = dir.as_str() .expect("expected workingdirectory to be utf-8 encoded"); - let dir_str = dir_str.to_utf16().append_one(0); + let dir_str: Vec = dir_str.utf16_units().collect(); + let dir_str = dir_str.append_one(0); + cb(dir_str.as_ptr()) }, None => cb(ptr::null()) diff --git a/src/librustdoc/flock.rs b/src/librustdoc/flock.rs index cb2ebd15b39..f07c0163676 100644 --- a/src/librustdoc/flock.rs +++ b/src/librustdoc/flock.rs @@ -162,7 +162,8 @@ mod imp { impl Lock { pub fn new(p: &Path) -> Lock { - let p_16 = p.as_str().unwrap().to_utf16().append_one(0); + let p_16: Vec = p.as_str().unwrap().utf16_units().collect(); + let p_16 = p_16.append_one(0); let handle = unsafe { libc::CreateFileW(p_16.as_ptr(), libc::FILE_GENERIC_READ | diff --git a/src/libstd/dynamic_lib.rs b/src/libstd/dynamic_lib.rs index ec2cc67a60a..728875ce260 100644 --- a/src/libstd/dynamic_lib.rs +++ b/src/libstd/dynamic_lib.rs @@ -281,19 +281,22 @@ pub mod dl { #[cfg(target_os = "win32")] pub mod dl { use c_str::ToCStr; + use iter::Iterator; use libc; use os; use ptr; use result::{Ok, Err, Result}; - use str::StrAllocating; + use str::StrSlice; use str; use string::String; + use vec::Vec; pub unsafe fn open_external(filename: T) -> *mut u8 { // Windows expects Unicode data let filename_cstr = filename.to_c_str(); let filename_str = str::from_utf8(filename_cstr.as_bytes_no_nul()).unwrap(); - let filename_str = filename_str.to_utf16().append_one(0); + let filename_str: Vec = filename_str.utf16_units().collect(); + let filename_str = filename_str.append_one(0); LoadLibraryW(filename_str.as_ptr() as *const libc::c_void) as *mut u8 } diff --git a/src/libstd/os.rs 
b/src/libstd/os.rs index 90f4cbb2577..be3b6be57b3 100644 --- a/src/libstd/os.rs +++ b/src/libstd/os.rs @@ -365,7 +365,8 @@ pub fn getenv(n: &str) -> Option { unsafe { with_env_lock(|| { use os::win32::{fill_utf16_buf_and_decode}; - let n = n.to_utf16().append_one(0); + let n: Vec = n.utf16_units().collect(); + let n = n.append_one(0); fill_utf16_buf_and_decode(|buf, sz| { libc::GetEnvironmentVariableW(n.as_ptr(), buf, sz) }) @@ -411,8 +412,10 @@ pub fn setenv(n: &str, v: &str) { #[cfg(windows)] fn _setenv(n: &str, v: &str) { - let n = n.to_utf16().append_one(0); - let v = v.to_utf16().append_one(0); + let n: Vec = n.utf16_units().collect(); + let n = n.append_one(0); + let v: Vec = v.utf16_units().collect(); + let v = v.append_one(0); unsafe { with_env_lock(|| { libc::SetEnvironmentVariableW(n.as_ptr(), v.as_ptr()); @@ -437,7 +440,8 @@ pub fn unsetenv(n: &str) { #[cfg(windows)] fn _unsetenv(n: &str) { - let n = n.to_utf16().append_one(0); + let n: Vec = n.utf16_units().collect(); + let n = n.append_one(0); unsafe { with_env_lock(|| { libc::SetEnvironmentVariableW(n.as_ptr(), ptr::null()); @@ -804,7 +808,7 @@ pub fn change_dir(p: &Path) -> bool { #[cfg(windows)] fn chdir(p: &Path) -> bool { let p = match p.as_str() { - Some(s) => s.to_utf16().append_one(0), + Some(s) => s.utf16_units().collect::>().append_one(0), None => return false, }; unsafe {