First implementation of the actual parser.

It doesn't do proper error handling yet and is completely untested,
so it probably has tons of bugs in it.
This commit is contained in:
Nathan Vegdahl 2022-06-19 17:11:43 -07:00
parent 838707d114
commit 82b6c5a5de
2 changed files with 386 additions and 208 deletions

View File

@ -1,189 +1,149 @@
use std::collections::{HashMap, HashSet}; use std::collections::HashMap;
use std::env::args_os; use std::env::args_os;
use std::ffi::{OsStr, OsString}; use std::ffi::OsString;
use std::ops::RangeBounds;
/// A command line argument parser. mod spec;
#[derive(Debug, Clone)]
pub struct Parser {
args: Vec<Arg>,
// Used to ensure we don't get duplicate arguments. pub use spec::Spec;
id_set: HashSet<String>,
long_set: HashSet<String>,
short_set: HashSet<String>,
}
impl Parser { pub fn parse(mut spec: Spec) -> ParsedArguments {
pub fn new() -> Parser { // Split into non-positional and positional arguments.
Parser { let (args, pos_args): (Vec<_>, Vec<_>) = spec
args: Vec::new(), .args
id_set: HashSet::new(), .drain(..)
long_set: HashSet::new(), .partition(|arg| arg.arg_type != spec::ArgType::Pos);
short_set: HashSet::new(),
}
}
/// Add a flag (bool) argument. // Validate positional arguments:
/// // - All required positional arguments should precede any optional
/// - `id`: the argument identifier, used for fetching argument // positional arguments.
/// matches. // - There should be at most a single positional multi-argument, and
/// - `flags`: the long and/or short argument flag strings. Must be // it must be at the end.
/// in the form "-f" or "--flag". You can pass as many as you {
/// like, all of which will be considered equivalent during let mut met_optional = false;
/// parsing. But there must be at least one. let mut met_multi = false;
/// - `doc`: the documentation string to use in the generated help. for arg in pos_args.iter() {
/// Pass an empty string to indicate no documentation. if arg.arg_type == spec::ArgType::Pos {
pub fn add_flag(&mut self, id: &str, flags: &[&str], doc: &str) { let is_optional = arg.acceptable_count.0 == 0;
let (long_flags, short_flags) = self.validate_and_process_arg(id, flags); let is_multi =
arg.acceptable_count.1.is_none() || arg.acceptable_count.1.unwrap() > 1;
self.args.push(Arg { if !is_optional && met_optional {
arg_type: ArgType::Flag, panic!("All required positional arguments must precede all optional positional arguments in the argument spec.")
id: id.into(), }
value_label: String::new(), if met_multi {
long_flags: long_flags, panic!("There must be at most one positional multi-argument in the argument spec, and it must come last.")
short_flags: short_flags, }
acceptable_count: (0, None),
doc: doc.into(),
});
}
/// Add a standard argument, that takes a value. met_optional |= is_optional;
pub fn add_argument( met_multi |= is_multi;
&mut self,
id: &str,
flags: &[&str],
doc: &str,
value_label: &str,
required: bool,
) {
let (long_flags, short_flags) = self.validate_and_process_arg(id, flags);
self.args.push(Arg {
arg_type: ArgType::Arg,
id: id.into(),
value_label: value_label.into(),
long_flags: long_flags,
short_flags: short_flags,
acceptable_count: (if required { 1 } else { 0 }, None),
doc: doc.into(),
});
}
/// Add a positional argument.
///
/// Unlike flags and standard arguments, positional arguments are
/// parsed in the order they're added. Because of their nature,
/// they have some additional considerations:
///
/// - All required positional arguments must precede all optional
/// positional arguments.
/// - There can at most be a single positional multi-argument,
/// which must come last. (See `add_positional_multi_argument()`.)
pub fn add_positional_argument(
&mut self,
id: &str,
doc: &str,
value_label: &str,
required: bool,
) {
let (_, _) = self.validate_and_process_arg(id, &[]);
self.args.push(Arg {
arg_type: ArgType::PosArg,
id: id.into(),
value_label: value_label.into(),
long_flags: Vec::new(),
short_flags: Vec::new(),
acceptable_count: (if required { 1 } else { 0 }, Some(1)),
doc: doc.into(),
});
}
pub fn add_positional_multi_argument(
&mut self,
id: &str,
doc: &str,
value_label: &str,
required: bool,
) {
let (_, _) = self.validate_and_process_arg(id, &[]);
self.args.push(Arg {
arg_type: ArgType::PosArg,
id: id.into(),
value_label: value_label.into(),
long_flags: Vec::new(),
short_flags: Vec::new(),
acceptable_count: (if required { 1 } else { 0 }, None),
doc: doc.into(),
});
}
//----------------
pub fn parse(self) -> ParsedArguments {
todo!()
}
//----------------
/// Returns (long, short) pair, each of which is a Vec of argument strings with
/// the leading hyphens stripped off.
fn validate_and_process_arg(&mut self, id: &str, flags: &[&str]) -> (Vec<String>, Vec<String>) {
if self.id_set.contains(id) {
panic!(
"Error: attempted to add argument with a duplicate ID \"{}\".",
id
);
}
self.id_set.insert(id.into());
let mut long_flags = Vec::new();
let mut short_flags = Vec::new();
for &flag in flags {
// Ensure no whitespace.
if flag.len() != flag.trim().len() || flag.split_whitespace().count() > 1 {
panic!(
"Error: attempted to add argument \"{}\" which contains whitespace.",
flag
);
} }
// Long flags. }
else if flag.starts_with("--") && flag.len() > 2 { }
if self.long_set.contains(flag) {
panic!( // Parse!
"Error: attempted to add duplicate long argument \"{}\".", // TODO: optimize by first creating a hash map from flag strings to
flag // argument indices. Right now this is an `O(NM)` algorithm, with N
// being the number of arguments in the spec and M being the number
// of arguments passed by the user. We can even further optimize it
// by first checking against the maximum length of our long arguments,
// so we don't end up hashing really long user arguments
// unnecessarily for the check.
let mut pos_i = 0; // Index of the positional argument we're at.
let mut pos_i_count = 0; // Number of positional arguments we've parsed at the current positional argument index.
let mut parsed = ParsedArguments {
arguments: Vec::new(),
id_map: HashMap::new(),
};
let mut args_in = args_os();
let _ = args_in.next(); // Skip the first argument, which is the call to the executable.
'outer: while let Some(arg_in) = args_in.next() {
// Check for flags and non-positional arguments.
if let Some(arg_in_str) = arg_in.to_str() {
if arg_in_str.starts_with("--") {
// Long.
for arg in args.iter() {
for long_flag in arg.long_flags.iter() {
if arg_in_str == long_flag {
match arg.arg_type {
spec::ArgType::Flag => parsed.push_arg(arg.id.clone(), None),
spec::ArgType::Arg => {
if let Some(value) = args_in.next() {
parsed.push_arg(arg.id.clone(), Some(value));
} else {
todo!("Handle error: expected value after argument flag.");
}
}
spec::ArgType::Pos => unreachable!(),
}
continue 'outer;
}
}
}
todo!("Handle error: no long argument matched the passed argument.");
} else if arg_in_str.starts_with("-") {
// Short.
let mut remainder = &arg_in_str[1..];
// First check arguments that take values.
for arg in args.iter().filter(|a| a.arg_type == spec::ArgType::Arg) {
for short_flag in arg.short_flags.iter() {
if remainder == short_flag {
if let Some(value) = args_in.next() {
parsed.push_arg(arg.id.clone(), Some(value));
} else {
todo!("Handle error: expected value after argument flag.");
}
continue 'outer;
}
}
}
// Then check boolean flags. There can be multiple
// present, so we progressively chop off the front as we
// find matches until nothing remains.
'restart_args: while !remainder.is_empty() {
for arg in args.iter().filter(|a| a.arg_type == spec::ArgType::Flag) {
for short_flag in arg.short_flags.iter() {
if remainder.starts_with(short_flag) {
remainder = &remainder[short_flag.len()..];
parsed.push_arg(arg.id.clone(), None);
continue 'restart_args;
}
}
}
todo!(
"Handle error: no short argument matches the next flag in \"{}\".",
remainder
); );
} }
self.long_set.insert(flag.into()); continue 'outer;
long_flags.push((&flag[2..]).into());
}
// Check if it's a valid short flag (should only have one character
// after the hyphen).
else if flag.starts_with("-") && flag.chars().count() == 2 {
if self.short_set.contains(flag) {
panic!(
"Error: attempted to add duplicate short argument \"{}\".",
flag
);
}
self.short_set.insert(flag.into());
short_flags.push((&flag[1..]).into());
}
// Not a valid flag.
else {
panic!(
"Error: attempted to add argument \"{}\", which isn't a valid argument string.",
flag
)
} }
} }
(long_flags, short_flags) if pos_i < pos_args.len() {
let arg = &pos_args[pos_i];
parsed.push_arg(arg.id.clone(), Some(arg_in));
pos_i_count += 1;
if let Some(max_count) = arg.acceptable_count.1 {
if pos_i_count == max_count {
pos_i += 1;
pos_i_count = 0;
}
}
} else {
todo!("Handle error: too many positional arguments.");
}
} }
if pos_i < pos_args.len() && pos_i_count < pos_args[pos_i].acceptable_count.0 {
todo!("Handle error: not enough positional arguments.");
}
parsed
} }
/// Parsed command line arguments. /// Parsed command line arguments.
@ -199,41 +159,15 @@ pub struct ParsedArguments {
id_map: HashMap<String, Vec<usize>>, // Argument ID -> index list id_map: HashMap<String, Vec<usize>>, // Argument ID -> index list
} }
//------------------------------------------------------------- impl ParsedArguments {
fn push_arg(&mut self, id: String, value: Option<OsString>) {
assert!(!id.is_empty());
#[derive(Debug, Copy, Clone, Eq, PartialEq)] if !self.id_map.contains_key(&id) {
enum ArgType { self.id_map.insert(id.clone(), Vec::new());
Flag, }
Arg, self.id_map.get_mut(&id).unwrap().push(self.arguments.len());
PosArg,
} self.arguments.push((id, value));
}
/// Argument specification.
#[derive(Debug, Clone)]
struct Arg {
arg_type: ArgType,
id: String,
value_label: String,
// Long and short versions of the argument flag. E.g. "--curve" and
// "-c", but without the leading dashes.
long_flags: Vec<String>,
short_flags: Vec<String>,
// How many instances of the argument can be present, specified
// as a range.
//
// For example:
// - (0, None): An argument that can show up any number of times,
// including not at all.
// - (0, 1): An argument that can either be absent or show up
// precisely once.
// - (1, 1): An argument that must show up precisely once.
// - (1, None): An argument that must show up at least once.
// - (2, 9): An argument that must show up at least twice, but
// no more than 9 times.
acceptable_count: (usize, Option<usize>),
// Documentation string, for generated help.
doc: String,
} }

244
src/spec.rs Normal file
View File

@ -0,0 +1,244 @@
use std::collections::HashSet;
/// Command line argument specification.
///
/// Built up builder-style: start from `Spec::new()` and chain the `add_*`
/// methods, each of which consumes the spec and returns it with one more
/// argument appended.
#[derive(Debug, Clone)]
pub struct Spec {
pub(crate) name: String, // Application name.
pub(crate) version: String, // Application version.
// Every argument added so far, in the order it was added.
pub(crate) args: Vec<Arg>,
// Used to ensure we don't get duplicate arguments.
// `id_set` holds the argument IDs; `long_set` and `short_set` hold the
// flag strings exactly as they were passed in (leading hyphens included).
id_set: HashSet<String>,
long_set: HashSet<String>,
short_set: HashSet<String>,
}
impl Spec {
    /// Create a new, empty argument specification.
    ///
    /// `name` and `version` are the name and version of the software,
    /// respectively.
    #[must_use]
    pub fn new(name: String, version: String) -> Spec {
        Spec {
            name,
            version,
            args: Vec::new(),
            id_set: HashSet::new(),
            long_set: HashSet::new(),
            short_set: HashSet::new(),
        }
    }

    /// Add a flag (bool) argument.
    ///
    /// - `id`: the argument identifier, used for fetching argument
    ///   matches.
    /// - `flags`: the long and/or short argument flag strings. Must be
    ///   in the form "-f" or "--flag". You can pass as many as you
    ///   like, all of which will be considered equivalent during
    ///   parsing. But there must be at least one.
    /// - `doc`: the documentation string to use in the generated help.
    ///   Pass an empty string to indicate no documentation.
    ///
    /// # Panics
    ///
    /// Panics if `flags` is empty, if `id` has already been used, or if
    /// any flag string contains whitespace, is not of the form "-f" or
    /// "--flag", or has already been registered.
    #[must_use]
    pub fn add_flag(mut self, id: &str, flags: &[&str], doc: &str) -> Self {
        Self::require_flags(id, flags);
        let (long_flags, short_flags) = self.validate_and_process_arg(id, flags);

        self.args.push(Arg {
            arg_type: ArgType::Flag,
            id: id.into(),
            value_label: String::new(),
            long_flags,
            short_flags,
            // A flag may be absent or repeated any number of times.
            acceptable_count: (0, None),
            doc: doc.into(),
        });

        self
    }

    /// Add a standard argument, that takes a value.
    ///
    /// - `id`: the argument identifier, used for fetching argument
    ///   matches.
    /// - `flags`: the long and/or short argument flag strings, with the
    ///   same rules as for `add_flag()`. There must be at least one.
    /// - `doc`: the documentation string to use in the generated help.
    /// - `value_label`: label for the argument's value (presumably shown
    ///   in the generated help; it is only stored here).
    /// - `required`: when true, the argument's minimum acceptable count
    ///   is 1 rather than 0. There is no upper bound either way.
    ///
    /// # Panics
    ///
    /// Panics if `flags` is empty, if `id` has already been used, or if
    /// any flag string contains whitespace, is not of the form "-f" or
    /// "--flag", or has already been registered.
    #[must_use]
    pub fn add_argument(
        mut self,
        id: &str,
        flags: &[&str],
        doc: &str,
        value_label: &str,
        required: bool,
    ) -> Self {
        Self::require_flags(id, flags);
        let (long_flags, short_flags) = self.validate_and_process_arg(id, flags);

        self.args.push(Arg {
            arg_type: ArgType::Arg,
            id: id.into(),
            value_label: value_label.into(),
            long_flags,
            short_flags,
            acceptable_count: (if required { 1 } else { 0 }, None),
            doc: doc.into(),
        });

        self
    }

    /// Add a positional argument.
    ///
    /// Unlike flags and standard arguments, positional arguments are
    /// parsed in the order they're added. Because of their nature,
    /// they have some additional considerations:
    ///
    /// - All required positional arguments must precede all optional
    ///   positional arguments.
    /// - There can at most be a single positional multi-argument,
    ///   which must come last. (See `add_positional_multi_argument()`.)
    ///
    /// The argument accepts at most one value; `required` raises the
    /// minimum from zero to one.
    ///
    /// # Panics
    ///
    /// Panics if `id` has already been used.
    #[must_use]
    pub fn add_positional_argument(
        mut self,
        id: &str,
        doc: &str,
        value_label: &str,
        required: bool,
    ) -> Self {
        // Positional arguments have no flags; this only registers the ID.
        let (_, _) = self.validate_and_process_arg(id, &[]);

        self.args.push(Arg {
            arg_type: ArgType::Pos,
            id: id.into(),
            value_label: value_label.into(),
            long_flags: Vec::new(),
            short_flags: Vec::new(),
            acceptable_count: (if required { 1 } else { 0 }, Some(1)),
            doc: doc.into(),
        });

        self
    }

    /// Add a positional multi-argument.
    ///
    /// Identical to `add_positional_argument()` except that there is no
    /// upper bound on how many values it accepts. `required` raises the
    /// minimum from zero to one. See `add_positional_argument()` for the
    /// ordering rules positional arguments are expected to follow.
    ///
    /// # Panics
    ///
    /// Panics if `id` has already been used.
    #[must_use]
    pub fn add_positional_multi_argument(
        mut self,
        id: &str,
        doc: &str,
        value_label: &str,
        required: bool,
    ) -> Self {
        // Positional arguments have no flags; this only registers the ID.
        let (_, _) = self.validate_and_process_arg(id, &[]);

        self.args.push(Arg {
            arg_type: ArgType::Pos,
            id: id.into(),
            value_label: value_label.into(),
            long_flags: Vec::new(),
            short_flags: Vec::new(),
            acceptable_count: (if required { 1 } else { 0 }, None),
            doc: doc.into(),
        });

        self
    }

    //----------------

    /// Enforces the documented rule that flag and standard arguments
    /// must be given at least one flag string.
    fn require_flags(id: &str, flags: &[&str]) {
        if flags.is_empty() {
            panic!(
                "Error: attempted to add argument \"{}\" without any flags.",
                id
            );
        }
    }

    /// Registers `id` and `flags` in the duplicate-tracking sets and
    /// returns the flags split into a (long, short) pair.
    ///
    /// Long flags are returned verbatim, *including* their leading "--".
    /// Short flags are returned with the leading "-" stripped off.
    ///
    /// # Panics
    ///
    /// Panics if `id` or any flag has already been registered, if a flag
    /// contains whitespace, or if a flag is neither "--" followed by at
    /// least one character nor "-" followed by at least one non-hyphen
    /// leading character. In particular a bare "--" is rejected rather
    /// than being treated as the short flag "-".
    fn validate_and_process_arg(&mut self, id: &str, flags: &[&str]) -> (Vec<String>, Vec<String>) {
        // `insert()` returns false when the value was already present.
        if !self.id_set.insert(id.into()) {
            panic!(
                "Error: attempted to add argument with a duplicate ID \"{}\".",
                id
            );
        }

        let mut long_flags = Vec::new();
        let mut short_flags = Vec::new();

        for &flag in flags {
            // Ensure no whitespace.
            if flag.chars().any(char::is_whitespace) {
                panic!(
                    "Error: attempted to add argument \"{}\" which contains whitespace.",
                    flag
                );
            }

            match flag.strip_prefix('-') {
                // Long flags: "--" followed by at least one character.
                Some(rest) if rest.starts_with('-') && rest.len() > 1 => {
                    if !self.long_set.insert(flag.into()) {
                        panic!(
                            "Error: attempted to add duplicate long argument \"{}\".",
                            flag
                        );
                    }
                    long_flags.push(flag.into());
                }

                // Short flags.
                // Note: in theory we should be checking to verify that
                // the flag is only one character long. But because of
                // graphemes that's complicated, and it's not really
                // worth all the code. So instead we just rely on client
                // code doing the right thing.
                Some(short) if !short.is_empty() && !short.starts_with('-') => {
                    if !self.short_set.insert(flag.into()) {
                        panic!(
                            "Error: attempted to add duplicate short argument \"{}\".",
                            flag
                        );
                    }
                    short_flags.push(short.into());
                }

                // Not a valid flag (no leading hyphen, a bare "-", or a
                // bare "--").
                _ => panic!(
                    "Error: attempted to add argument \"{}\", which isn't a valid argument string.",
                    flag
                ),
            }
        }

        (long_flags, short_flags)
    }
}
//-------------------------------------------------------------
/// The kind of a command line argument in an argument specification.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub(crate) enum ArgType {
Flag, // Boolean flag (present or absent)
Arg, // Standard flag+value argument, like `-i input_file`.
Pos, // Positional argument.
}
/// Argument specification.
///
/// Describes a single argument of a `Spec`: a boolean flag, a flag+value
/// argument, or a positional argument (see `ArgType`).
#[derive(Debug, Clone)]
pub(crate) struct Arg {
// Which kind of argument this is.
pub(crate) arg_type: ArgType,
// Identifier of the argument; `Spec` rejects duplicates.
pub(crate) id: String,
// Label for the argument's value. Left empty for boolean flags.
// NOTE(review): presumably used in the generated help -- confirm.
pub(crate) value_label: String,
// Long and short versions of the argument flag, e.g. "--curve" and
// "-c". As filled in by `Spec::validate_and_process_arg()`, long flags
// are stored verbatim *with* their leading "--", while short flags are
// stored with the leading "-" stripped. Both lists are empty for
// positional arguments.
pub(crate) long_flags: Vec<String>,
pub(crate) short_flags: Vec<String>,
// How many instances of the argument can be present, specified
// as a (minimum, maximum) pair, where a maximum of `None` means
// "no upper bound".
//
// For example:
// - (0, None): An argument that can show up any number of times,
// including not at all.
// - (0, Some(1)): An argument that can either be absent or show up
// precisely once.
// - (1, Some(1)): An argument that must show up precisely once.
// - (1, None): An argument that must show up at least once.
// - (2, Some(9)): An argument that must show up at least twice, but
// no more than 9 times.
pub(crate) acceptable_count: (usize, Option<usize>),
// Documentation string, for generated help.
pub(crate) doc: String,
}