Implement modern Windows arg parsing
As derived from extensive testing of `argv` in a C/C++ application. Co-Authored-By: Jane Lusby <jlusby42@gmail.com>
This commit is contained in:
parent
565a51973a
commit
e26dda5642
@ -253,6 +253,7 @@
|
||||
#![feature(const_ip)]
|
||||
#![feature(const_ipv4)]
|
||||
#![feature(const_ipv6)]
|
||||
#![feature(const_option)]
|
||||
#![feature(const_raw_ptr_deref)]
|
||||
#![feature(const_socketaddr)]
|
||||
#![feature(container_error_extra)]
|
||||
|
@ -1,13 +1,18 @@
|
||||
#![allow(dead_code)] // runtime init functions not used during testing
|
||||
//! The Windows command line is just a string
|
||||
//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
|
||||
//!
|
||||
//! This module implements the parsing necessary to turn that string into a list of arguments.
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use crate::ffi::OsString;
|
||||
use crate::fmt;
|
||||
use crate::marker::PhantomData;
|
||||
use crate::num::NonZeroU16;
|
||||
use crate::os::windows::prelude::*;
|
||||
use crate::path::PathBuf;
|
||||
use crate::slice;
|
||||
use crate::ptr::NonNull;
|
||||
use crate::sys::c;
|
||||
use crate::sys::windows::os::current_exe;
|
||||
use crate::vec;
|
||||
@ -15,9 +20,11 @@ use crate::vec;
|
||||
use core::iter;
|
||||
|
||||
pub fn args() -> Args {
|
||||
// SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
|
||||
// string so it's safe for `WStrUnits` to use.
|
||||
unsafe {
|
||||
let lp_cmd_line = c::GetCommandLineW();
|
||||
let parsed_args_list = parse_lp_cmd_line(lp_cmd_line as *const u16, || {
|
||||
let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
|
||||
current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new())
|
||||
});
|
||||
|
||||
@ -28,129 +35,120 @@ pub fn args() -> Args {
|
||||
/// Implements the Windows command-line argument parsing algorithm.
|
||||
///
|
||||
/// Microsoft's documentation for the Windows CLI argument format can be found at
|
||||
/// <https://docs.microsoft.com/en-us/previous-versions//17w5ykft(v=vs.85)>.
|
||||
/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
|
||||
///
|
||||
/// Windows includes a function to do this in shell32.dll,
|
||||
/// but linking with that DLL causes the process to be registered as a GUI application.
|
||||
/// A more in-depth explanation is here:
|
||||
/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
|
||||
///
|
||||
/// Windows includes a function to do command line parsing in shell32.dll.
|
||||
/// However, this is not used for two reasons:
|
||||
///
|
||||
/// 1. Linking with that DLL causes the process to be registered as a GUI application.
|
||||
/// GUI applications add a bunch of overhead, even if no windows are drawn. See
|
||||
/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
|
||||
///
|
||||
/// This function was tested for equivalence to the shell32.dll implementation in
|
||||
/// Windows 10 Pro v1803, using an exhaustive test suite available at
|
||||
/// <https://gist.github.com/notriddle/dde431930c392e428055b2dc22e638f5> or
|
||||
/// <https://paste.gg/p/anonymous/47d6ed5f5bd549168b1c69c799825223>.
|
||||
unsafe fn parse_lp_cmd_line<F: Fn() -> OsString>(
|
||||
lp_cmd_line: *const u16,
|
||||
/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
|
||||
///
|
||||
/// This function was tested for equivalence to the C/C++ parsing rules using an
|
||||
/// extensive test suite available at
|
||||
/// <https://github.com/ChrisDenton/winarg/tree/std>.
|
||||
fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
|
||||
lp_cmd_line: Option<WStrUnits<'a>>,
|
||||
exe_name: F,
|
||||
) -> Vec<OsString> {
|
||||
const BACKSLASH: u16 = '\\' as u16;
|
||||
const QUOTE: u16 = '"' as u16;
|
||||
const TAB: u16 = '\t' as u16;
|
||||
const SPACE: u16 = ' ' as u16;
|
||||
const BACKSLASH: NonZeroU16 = NonZeroU16::new(b'\\' as u16).unwrap();
|
||||
const QUOTE: NonZeroU16 = NonZeroU16::new(b'"' as u16).unwrap();
|
||||
const TAB: NonZeroU16 = NonZeroU16::new(b'\t' as u16).unwrap();
|
||||
const SPACE: NonZeroU16 = NonZeroU16::new(b' ' as u16).unwrap();
|
||||
|
||||
let mut ret_val = Vec::new();
|
||||
if lp_cmd_line.is_null() || *lp_cmd_line == 0 {
|
||||
// If the cmd line pointer is null or it points to an empty string then
|
||||
// return the name of the executable as argv[0].
|
||||
if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
|
||||
ret_val.push(exe_name());
|
||||
return ret_val;
|
||||
}
|
||||
let mut cmd_line = {
|
||||
let mut end = 0;
|
||||
while *lp_cmd_line.offset(end) != 0 {
|
||||
end += 1;
|
||||
}
|
||||
slice::from_raw_parts(lp_cmd_line, end as usize)
|
||||
};
|
||||
let mut code_units = lp_cmd_line.unwrap();
|
||||
|
||||
// The executable name at the beginning is special.
|
||||
cmd_line = match cmd_line[0] {
|
||||
// The executable name ends at the next quote mark,
|
||||
// no matter what.
|
||||
QUOTE => {
|
||||
let args = {
|
||||
let mut cut = cmd_line[1..].splitn(2, |&c| c == QUOTE);
|
||||
if let Some(exe) = cut.next() {
|
||||
ret_val.push(OsString::from_wide(exe));
|
||||
}
|
||||
cut.next()
|
||||
};
|
||||
if let Some(args) = args {
|
||||
args
|
||||
} else {
|
||||
return ret_val;
|
||||
}
|
||||
}
|
||||
// Implement quirk: when they say whitespace here,
|
||||
// they include the entire ASCII control plane:
|
||||
// "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW
|
||||
// will consider the first argument to be an empty string. Excess whitespace at the
|
||||
// end of lpCmdLine is ignored."
|
||||
0..=SPACE => {
|
||||
ret_val.push(OsString::new());
|
||||
&cmd_line[1..]
|
||||
}
|
||||
// The executable name ends at the next whitespace,
|
||||
// no matter what.
|
||||
_ => {
|
||||
let args = {
|
||||
let mut cut = cmd_line.splitn(2, |&c| c > 0 && c <= SPACE);
|
||||
if let Some(exe) = cut.next() {
|
||||
ret_val.push(OsString::from_wide(exe));
|
||||
}
|
||||
cut.next()
|
||||
};
|
||||
if let Some(args) = args {
|
||||
args
|
||||
} else {
|
||||
return ret_val;
|
||||
}
|
||||
}
|
||||
};
|
||||
let mut cur = Vec::new();
|
||||
let mut in_quotes = false;
|
||||
let mut was_in_quotes = false;
|
||||
let mut backslash_count: usize = 0;
|
||||
for &c in cmd_line {
|
||||
match c {
|
||||
// backslash
|
||||
BACKSLASH => {
|
||||
backslash_count += 1;
|
||||
was_in_quotes = false;
|
||||
}
|
||||
QUOTE if backslash_count % 2 == 0 => {
|
||||
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
|
||||
backslash_count = 0;
|
||||
if was_in_quotes {
|
||||
cur.push('"' as u16);
|
||||
was_in_quotes = false;
|
||||
} else {
|
||||
was_in_quotes = in_quotes;
|
||||
in_quotes = !in_quotes;
|
||||
}
|
||||
}
|
||||
QUOTE if backslash_count % 2 != 0 => {
|
||||
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
|
||||
backslash_count = 0;
|
||||
was_in_quotes = false;
|
||||
cur.push(b'"' as u16);
|
||||
}
|
||||
SPACE | TAB if !in_quotes => {
|
||||
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
|
||||
if !cur.is_empty() || was_in_quotes {
|
||||
ret_val.push(OsString::from_wide(&cur[..]));
|
||||
cur.truncate(0);
|
||||
}
|
||||
backslash_count = 0;
|
||||
was_in_quotes = false;
|
||||
}
|
||||
_ => {
|
||||
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
|
||||
backslash_count = 0;
|
||||
was_in_quotes = false;
|
||||
cur.push(c);
|
||||
}
|
||||
let mut cur = Vec::new();
|
||||
for w in &mut code_units {
|
||||
match w {
|
||||
// A quote mark always toggles `in_quotes` no matter what because
|
||||
// there are no escape characters when parsing the executable name.
|
||||
QUOTE => in_quotes = !in_quotes,
|
||||
// If not `in_quotes` then whitespace ends argv[0].
|
||||
SPACE | TAB if !in_quotes => break,
|
||||
// In all other cases the code unit is taken literally.
|
||||
_ => cur.push(w.get()),
|
||||
}
|
||||
}
|
||||
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
|
||||
// include empty quoted strings at the end of the arguments list
|
||||
if !cur.is_empty() || was_in_quotes || in_quotes {
|
||||
// Skip whitespace.
|
||||
code_units.advance_while(|w| w == SPACE || w == TAB);
|
||||
ret_val.push(OsString::from_wide(&cur));
|
||||
|
||||
// Parse the arguments according to these rules:
|
||||
// * All code units are taken literally except space, tab, quote and backslash.
|
||||
// * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
|
||||
// treated as a single separator.
|
||||
// * A space or tab `in_quotes` is taken literally.
|
||||
// * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
|
||||
// * A quote can be escaped if preceded by an odd number of backslashes.
|
||||
// * If any number of backslashes is immediately followed by a quote then the number of
|
||||
// backslashes is halved (rounding down).
|
||||
// * Backslashes not followed by a quote are all taken literally.
|
||||
// * If `in_quotes` then a quote can also be escaped using another quote
|
||||
// (i.e. two consecutive quotes become one literal quote).
|
||||
let mut cur = Vec::new();
|
||||
let mut in_quotes = false;
|
||||
while let Some(w) = code_units.next() {
|
||||
match w {
|
||||
// If not `in_quotes`, a space or tab ends the argument.
|
||||
SPACE | TAB if !in_quotes => {
|
||||
ret_val.push(OsString::from_wide(&cur[..]));
|
||||
cur.truncate(0);
|
||||
|
||||
// Skip whitespace.
|
||||
code_units.advance_while(|w| w == SPACE || w == TAB);
|
||||
}
|
||||
// Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
|
||||
BACKSLASH => {
|
||||
let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
|
||||
if code_units.peek() == Some(QUOTE) {
|
||||
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
|
||||
// The quote is escaped if there are an odd number of backslashes.
|
||||
if backslash_count % 2 == 1 {
|
||||
code_units.next();
|
||||
cur.push(QUOTE.get());
|
||||
}
|
||||
} else {
|
||||
// If there is no quote on the end then there is no escaping.
|
||||
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
|
||||
}
|
||||
}
|
||||
// If `in_quotes` and not backslash escaped (see above) then a quote either
|
||||
// unsets `in_quote` or is escaped by another quote.
|
||||
QUOTE if in_quotes => match code_units.peek() {
|
||||
// Two consecutive quotes when `in_quotes` produces one literal quote.
|
||||
Some(QUOTE) => {
|
||||
cur.push(QUOTE.get());
|
||||
code_units.next();
|
||||
}
|
||||
// Otherwise set `in_quotes`.
|
||||
Some(_) => in_quotes = false,
|
||||
// The end of the command line.
|
||||
// Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
|
||||
None => break,
|
||||
},
|
||||
// If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`.
|
||||
QUOTE => in_quotes = true,
|
||||
// Everything else is always taken literally.
|
||||
_ => cur.push(w.get()),
|
||||
}
|
||||
}
|
||||
// Push the final argument, if any.
|
||||
if !cur.is_empty() || in_quotes {
|
||||
ret_val.push(OsString::from_wide(&cur[..]));
|
||||
}
|
||||
ret_val
|
||||
@ -187,3 +185,52 @@ impl ExactSizeIterator for Args {
|
||||
self.parsed_args_list.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// A safe iterator over a LPWSTR
|
||||
/// (aka a pointer to a series of UTF-16 code units terminated by a NULL).
|
||||
struct WStrUnits<'a> {
|
||||
// The pointer must never be null...
|
||||
lpwstr: NonNull<u16>,
|
||||
// ...and the memory it points to must be valid for this lifetime.
|
||||
lifetime: PhantomData<&'a [u16]>,
|
||||
}
|
||||
impl WStrUnits<'_> {
|
||||
/// Create the iterator. Returns `None` if `lpwstr` is null.
|
||||
///
|
||||
/// SAFETY: `lpwstr` must point to a null-terminated wide string that lives
|
||||
/// at least as long as the lifetime of this struct.
|
||||
unsafe fn new(lpwstr: *const u16) -> Option<Self> {
|
||||
Some(Self { lpwstr: NonNull::new(lpwstr as _)?, lifetime: PhantomData })
|
||||
}
|
||||
fn peek(&self) -> Option<NonZeroU16> {
|
||||
// SAFETY: It's always safe to read the current item because we don't
|
||||
// ever move out of the array's bounds.
|
||||
unsafe { NonZeroU16::new(*self.lpwstr.as_ptr()) }
|
||||
}
|
||||
/// Advance the iterator while `predicate` returns true.
|
||||
/// Returns the number of items it advanced by.
|
||||
fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize {
|
||||
let mut counter = 0;
|
||||
while let Some(w) = self.peek() {
|
||||
if !predicate(w) {
|
||||
break;
|
||||
}
|
||||
counter += 1;
|
||||
self.next();
|
||||
}
|
||||
counter
|
||||
}
|
||||
}
|
||||
impl Iterator for WStrUnits<'_> {
|
||||
// This can never return zero as that marks the end of the string.
|
||||
type Item = NonZeroU16;
|
||||
fn next(&mut self) -> Option<NonZeroU16> {
|
||||
// SAFETY: If NULL is reached we immediately return.
|
||||
// Therefore it's safe to advance the pointer after that.
|
||||
unsafe {
|
||||
let next = self.peek()?;
|
||||
self.lpwstr = NonNull::new_unchecked(self.lpwstr.as_ptr().add(1));
|
||||
Some(next)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -781,7 +781,7 @@ extern "system" {
|
||||
pub fn RemoveDirectoryW(lpPathName: LPCWSTR) -> BOOL;
|
||||
pub fn SetFileAttributesW(lpFileName: LPCWSTR, dwFileAttributes: DWORD) -> BOOL;
|
||||
pub fn SetLastError(dwErrCode: DWORD);
|
||||
pub fn GetCommandLineW() -> *mut LPCWSTR;
|
||||
pub fn GetCommandLineW() -> LPWSTR;
|
||||
pub fn GetTempPathW(nBufferLength: DWORD, lpBuffer: LPCWSTR) -> DWORD;
|
||||
pub fn GetCurrentProcess() -> HANDLE;
|
||||
pub fn GetCurrentThread() -> HANDLE;
|
||||
|
Loading…
x
Reference in New Issue
Block a user