From f117fda29e4b0802a78d943e4c583661aeb4bde9 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Tue, 11 Feb 2020 20:54:41 +0900 Subject: [PATCH] Cleaning up a bunch of old unused code. --- src/string_utils.rs | 349 ++++++--------------------------------- src/term_ui/formatter.rs | 13 -- 2 files changed, 47 insertions(+), 315 deletions(-) diff --git a/src/string_utils.rs b/src/string_utils.rs index eeabb8e..8626bff 100644 --- a/src/string_utils.rs +++ b/src/string_utils.rs @@ -1,24 +1,11 @@ -#![allow(dead_code)] //! Misc helpful utility functions for TextBuffer related stuff. -use std::iter::repeat; - -use ropey::RopeSlice; -use unicode_segmentation::UnicodeSegmentation; +use ropey::{str_utils::byte_to_char_idx, RopeSlice}; pub fn is_line_ending(text: &str) -> bool { - match text { - "\u{000D}\u{000A}" | "\u{000A}" | "\u{000B}" | "\u{000C}" | "\u{000D}" | "\u{0085}" - | "\u{2028}" | "\u{2029}" => true, - - _ => false, - } -} - -pub fn rope_slice_is_line_ending(text: &RopeSlice) -> bool { - match text.char(0) { - c if (c >= '\u{000A}' && c <= '\u{000D}') => true, - '\u{0085}' | '\u{2028}' | '\u{2029}' => true, + match text.chars().nth(0) { + Some(c) if (c >= '\u{000A}' && c <= '\u{000D}') => true, + Some('\u{0085}') | Some('\u{2028}') | Some('\u{2029}') => true, _ => false, } } @@ -27,237 +14,30 @@ pub fn is_whitespace(text: &str) -> bool { // TODO: this is a naive categorization of whitespace characters. // For better categorization these should be split up into groups // based on e.g. breaking vs non-breaking spaces, among other things. - match text { - "\u{0020}" // SPACE - | "\u{0009}" // CHARACTER TABULATION - | "\u{00A0}" // NO-BREAK SPACE - //| "\u{1680}" // OGHAM SPACE MARK (here for completeness, but usually displayed as a dash, not as whitespace) - | "\u{180E}" // MONGOLIAN VOWEL SEPARATOR - | "\u{2000}" // EN QUAD - | "\u{2001}" // EM QUAD - | "\u{2002}" // EN SPACE - | "\u{2003}" // EM SPACE - | "\u{2004}" // THREE-PER-EM SPACE - | "\u{2005}" // FOUR-PER-EM SPACE - | "\u{2006}" // SIX-PER-EM SPACE - | "\u{2007}" // FIGURE SPACE - | "\u{2008}" // PUNCTUATION SPACE - | "\u{2009}" // THIN SPACE - | "\u{200A}" // HAIR SPACE - | "\u{200B}" // ZERO WIDTH SPACE - | "\u{202F}" // NARROW NO-BREAK SPACE - | "\u{205F}" // MEDIUM MATHEMATICAL SPACE - | "\u{3000}" // IDEOGRAPHIC SPACE - | "\u{FEFF}" // ZERO WIDTH NO-BREAK SPACE + match text.chars().nth(0) { + //Some('\u{1680}') | // OGHAM SPACE MARK (here for completeness, but usually displayed as a dash, not as whitespace) + Some('\u{0009}') | // CHARACTER TABULATION + Some('\u{0020}') | // SPACE + Some('\u{00A0}') | // NO-BREAK SPACE + Some('\u{180E}') | // MONGOLIAN VOWEL SEPARATOR + Some('\u{202F}') | // NARROW NO-BREAK SPACE + Some('\u{205F}') | // MEDIUM MATHEMATICAL SPACE + Some('\u{3000}') | // IDEOGRAPHIC SPACE + Some('\u{FEFF}') // ZERO WIDTH NO-BREAK SPACE => true, + + // EN QUAD, EM QUAD, EN SPACE, EM SPACE, THREE-PER-EM SPACE, + // FOUR-PER-EM SPACE, SIX-PER-EM SPACE, FIGURE SPACE, + // PUNCTUATION SPACE, THIN SPACE, HAIR SPACE, ZERO WIDTH SPACE. + Some(c) if c >= '\u{2000}' && c <= '\u{200B}' => true, + + // None, or not a matching whitespace character. _ => false, } } -pub fn rope_slice_is_whitespace(text: &RopeSlice) -> bool { - // TODO: this is a naive categorization of whitespace characters. - // For better categorization these should be split up into groups - // based on e.g. breaking vs non-breaking spaces, among other things. - - if let Some(text) = text.as_str() { - is_whitespace(text) - } else { - text == "\u{0020}" // SPACE - || text == "\u{0009}" // CHARACTER TABULATION - || text == "\u{00A0}" // NO-BREAK SPACE - //|| "\u{1680}" // OGHAM SPACE MARK (here for completeness, but usually displayed as a dash, not as whitespace) - || text == "\u{180E}" // MONGOLIAN VOWEL SEPARATOR - || text == "\u{2000}" // EN QUAD - || text == "\u{2001}" // EM QUAD - || text == "\u{2002}" // EN SPACE - || text == "\u{2003}" // EM SPACE - || text == "\u{2004}" // THREE-PER-EM SPACE - || text == "\u{2005}" // FOUR-PER-EM SPACE - || text == "\u{2006}" // SIX-PER-EM SPACE - || text == "\u{2007}" // FIGURE SPACE - || text == "\u{2008}" // PUNCTUATION SPACE - || text == "\u{2009}" // THIN SPACE - || text == "\u{200A}" // HAIR SPACE - || text == "\u{200B}" // ZERO WIDTH SPACE - || text == "\u{202F}" // NARROW NO-BREAK SPACE - || text == "\u{205F}" // MEDIUM MATHEMATICAL SPACE - || text == "\u{3000}" // IDEOGRAPHIC SPACE - || text == "\u{FEFF}" // ZERO WIDTH NO-BREAK SPACE - } -} - -pub fn line_ending_count(text: &str) -> usize { - let mut count = 0; - for g in UnicodeSegmentation::graphemes(text, true) { - if is_line_ending(g) { - count += 1; - } - } - return count; -} - pub fn char_count(text: &str) -> usize { - let mut count = 0; - for _ in text.chars() { - count += 1; - } - return count; -} - -pub fn grapheme_count(text: &str) -> usize { - let mut count = 0; - for _ in UnicodeSegmentation::graphemes(text, true) { - count += 1; - } - return count; -} - -pub fn grapheme_count_is_less_than(text: &str, n: usize) -> bool { - let mut count = 0; - for _ in UnicodeSegmentation::graphemes(text, true) { - count += 1; - if count >= n { - return false; - } - } - - return true; -} - -pub fn grapheme_and_line_ending_count(text: &str) -> (usize, usize) { - let mut grapheme_count = 0; - let mut line_ending_count = 0; - - for g in UnicodeSegmentation::graphemes(text, true) { - grapheme_count += 1; - if is_line_ending(g) { - line_ending_count += 1; - } - } - - return (grapheme_count, line_ending_count); -} - -pub fn char_pos_to_byte_pos(text: &str, pos: usize) -> usize { - let mut i: usize = 0; - - for (offset, _) in text.char_indices() { - if i == pos { - return offset; - } - i += 1; - } - - if i == pos { - return text.len(); - } - - panic!("char_pos_to_byte_pos(): char position off the end of the string."); -} - -pub fn grapheme_pos_to_byte_pos(text: &str, pos: usize) -> usize { - let mut i: usize = 0; - - for (offset, _) in UnicodeSegmentation::grapheme_indices(text, true) { - if i == pos { - return offset; - } - i += 1; - } - - if i == pos { - return text.len(); - } - - panic!("grapheme_pos_to_byte_pos(): grapheme position off the end of the string."); -} - -/// Inserts the given text into the given string at the given grapheme index. -pub fn insert_text_at_grapheme_index(s: &mut String, text: &str, pos: usize) { - // Find insertion position in bytes - let byte_pos = grapheme_pos_to_byte_pos(&s[..], pos); - - // Get byte vec of string - let byte_vec = unsafe { s.as_mut_vec() }; - - // Grow data size - byte_vec.extend(repeat(0).take(text.len())); - - // Move old bytes forward - // TODO: use copy_memory()...? - let mut from = byte_vec.len() - text.len(); - let mut to = byte_vec.len(); - while from > byte_pos { - from -= 1; - to -= 1; - - byte_vec[to] = byte_vec[from]; - } - - // Copy new bytes in - // TODO: use copy_memory() - let mut i = byte_pos; - for g in UnicodeSegmentation::graphemes(text, true) { - for b in g.bytes() { - byte_vec[i] = b; - i += 1 - } - } -} - -/// Removes the text between the given grapheme indices in the given string. -pub fn remove_text_between_grapheme_indices(s: &mut String, pos_a: usize, pos_b: usize) { - // Bounds checks - assert!( - pos_a <= pos_b, - "remove_text_between_grapheme_indices(): pos_a must be less than or equal to pos_b." - ); - - if pos_a == pos_b { - return; - } - - // Find removal positions in bytes - // TODO: get both of these in a single pass - let byte_pos_a = grapheme_pos_to_byte_pos(&s[..], pos_a); - let byte_pos_b = grapheme_pos_to_byte_pos(&s[..], pos_b); - - // Get byte vec of string - let byte_vec = unsafe { s.as_mut_vec() }; - - // Move bytes to fill in the gap left by the removed bytes - let mut from = byte_pos_b; - let mut to = byte_pos_a; - while from < byte_vec.len() { - byte_vec[to] = byte_vec[from]; - - from += 1; - to += 1; - } - - // Remove data from the end - let final_text_size = byte_vec.len() + byte_pos_a - byte_pos_b; - byte_vec.truncate(final_text_size); -} - -/// Splits a string into two strings at the grapheme index given. -/// The first section of the split is stored in the original string, -/// while the second section of the split is returned as a new string. -pub fn split_string_at_grapheme_index(s1: &mut String, pos: usize) -> String { - let mut s2 = String::new(); - - // Code block to contain the borrow of s2 - { - let byte_pos = grapheme_pos_to_byte_pos(&s1[..], pos); - - let byte_vec_1 = unsafe { s1.as_mut_vec() }; - let byte_vec_2 = unsafe { s2.as_mut_vec() }; - - byte_vec_2.extend((&byte_vec_1[byte_pos..]).iter().cloned()); - byte_vec_1.truncate(byte_pos); - } - - return s2; + byte_to_char_idx(text, text.len()) } /// Represents one of the valid Unicode line endings. @@ -277,54 +57,17 @@ pub enum LineEnding { pub fn str_to_line_ending(g: &str) -> LineEnding { match g { - // ============== - // Line endings - // ============== - // - // CRLF - "\u{000D}\u{000A}" => { - return LineEnding::CRLF; - } - - // LF - "\u{000A}" => { - return LineEnding::LF; - } - - // VT - "\u{000B}" => { - return LineEnding::VT; - } - - // FF - "\u{000C}" => { - return LineEnding::FF; - } - - // CR - "\u{000D}" => { - return LineEnding::CR; - } - - // NEL - "\u{0085}" => { - return LineEnding::NEL; - } - - // LS - "\u{2028}" => { - return LineEnding::LS; - } - - // PS - "\u{2029}" => { - return LineEnding::PS; - } + "\u{000D}\u{000A}" => LineEnding::CRLF, + "\u{000A}" => LineEnding::LF, + "\u{000B}" => LineEnding::VT, + "\u{000C}" => LineEnding::FF, + "\u{000D}" => LineEnding::CR, + "\u{0085}" => LineEnding::NEL, + "\u{2028}" => LineEnding::LS, + "\u{2029}" => LineEnding::PS, // Not a line ending - _ => { - return LineEnding::None; - } + _ => LineEnding::None, } } @@ -333,20 +76,6 @@ pub fn rope_slice_to_line_ending(g: &RopeSlice) -> LineEnding { str_to_line_ending(text) } else if g == "\u{000D}\u{000A}" { LineEnding::CRLF - } else if g == "\u{000A}" { - LineEnding::LF - } else if g == "\u{000B}" { - LineEnding::VT - } else if g == "\u{000C}" { - LineEnding::FF - } else if g == "\u{000D}" { - LineEnding::CR - } else if g == "\u{0085}" { - LineEnding::NEL - } else if g == "\u{2028}" { - LineEnding::LS - } else if g == "\u{2029}" { - LineEnding::PS } else { // Not a line ending LineEnding::None @@ -370,3 +99,19 @@ pub const LINE_ENDINGS: [&'static str; 9] = [ "\u{2028}", "\u{2029}", ]; + +//-------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn char_count_1() { + let text_1 = "Hello world!"; + let text_2 = "今日はみんなさん!"; + + assert_eq!(12, char_count(text_1)); + assert_eq!(9, char_count(text_2)); + } +} diff --git a/src/term_ui/formatter.rs b/src/term_ui/formatter.rs index d91d9ae..387394c 100644 --- a/src/term_ui/formatter.rs +++ b/src/term_ui/formatter.rs @@ -236,19 +236,6 @@ mod tests { assert_eq!(f.dimensions(RopeGraphemes::new(&text.slice(..))), (1, 22)); } - #[test] - fn dimensions_2() { - let text = Rope::from_str("Hello there, stranger! How are you doing this fine day?"); // 56 graphemes long - - let mut f = ConsoleLineFormatter::new(4); - f.wrap_type = WrapType::CharWrap(0); - f.maintain_indent = false; - f.wrap_additional_indent = 0; - f.set_wrap_width(12); - - assert_eq!(f.dimensions(RopeGraphemes::new(&text.slice(..))), (5, 12)); - } - #[test] fn dimensions_3() { let text = Rope::from_str("Hello there, stranger! How are you doing this fine day?"); // 56 graphemes long