Added unit tests for windows1252 encoding/decoding.

2018-08-22 19:00:47 -07:00 · 2018-08-22 19:00:47 -07:00 · 3203da38bd
commit 3203da38bd
parent fb95ff36c1
1 changed file with 305 additions and 32 deletions
--- a/sub_crates/text_encoding/src/windows1252.rs
+++ b/sub_crates/text_encoding/src/windows1252.rs
@@ -14,7 +14,7 @@ pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a
        if let Some(byte) = encode_table(c) {
            output[output_i] = byte;
            output_i += 1;
-            input_i = offset;
+            input_i = offset + 1;
        } else {
            return Err(EncodeError {
                character: c,
@@ -25,7 +25,6 @@ pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a
    }

    // Calculate how much of the input was consumed.
-    input_i += 1;
    if input_i > input.len() {
        input_i = input.len();
    } else {
@@ -94,33 +93,33 @@ fn encode_table(code: char) -> Option<u8> {
        return Some(code as u8);
    }
    match code {
-        '€' => Some(0x80),
-        '‚' => Some(0x82),
-        'ƒ' => Some(0x83),
-        '„' => Some(0x84),
-        '…' => Some(0x85),
-        '†' => Some(0x86),
-        '‡' => Some(0x87),
-        'ˆ' => Some(0x88),
-        '‰' => Some(0x89),
-        'Š' => Some(0x8A),
-        '‹' => Some(0x8B),
-        'Œ' => Some(0x8C),
-        'Ž' => Some(0x8E),
-        '‘' => Some(0x91),
-        '’' => Some(0x92),
-        '“' => Some(0x93),
-        '”' => Some(0x94),
-        '•' => Some(0x95),
-        '–' => Some(0x96),
-        '—' => Some(0x97),
-        '˜' => Some(0x98),
-        '™' => Some(0x99),
-        'š' => Some(0x9A),
-        '›' => Some(0x9B),
-        'œ' => Some(0x9C),
-        'ž' => Some(0x9E),
-        'Ÿ' => Some(0x9F),
+        '\u{20AC}' => Some(0x80),
+        '\u{201A}' => Some(0x82),
+        '\u{0192}' => Some(0x83),
+        '\u{201E}' => Some(0x84),
+        '\u{2026}' => Some(0x85),
+        '\u{2020}' => Some(0x86),
+        '\u{2021}' => Some(0x87),
+        '\u{02C6}' => Some(0x88),
+        '\u{2030}' => Some(0x89),
+        '\u{0160}' => Some(0x8A),
+        '\u{2039}' => Some(0x8B),
+        '\u{0152}' => Some(0x8C),
+        '\u{017D}' => Some(0x8E),
+        '\u{2018}' => Some(0x91),
+        '\u{2019}' => Some(0x92),
+        '\u{201C}' => Some(0x93),
+        '\u{201D}' => Some(0x94),
+        '\u{2022}' => Some(0x95),
+        '\u{2013}' => Some(0x96),
+        '\u{2014}' => Some(0x97),
+        '\u{02DC}' => Some(0x98),
+        '\u{2122}' => Some(0x99),
+        '\u{0161}' => Some(0x9A),
+        '\u{203A}' => Some(0x9B),
+        '\u{0153}' => Some(0x9C),
+        '\u{017E}' => Some(0x9E),
+        '\u{0178}' => Some(0x9F),
        _ => None,
    }
 }
@@ -131,7 +130,281 @@ fn encode_table(code: char) -> Option<u8> {
 // The '�'s stand in for codes not defined in windows-1252, and should be
 // treated as an error when encountered.
 const DECODE_TABLE: [char; 32] = [
-    '€', '�', '‚', 'ƒ', '„', '…', '†', '‡', 'ˆ', '‰', 'Š', '‹', 'Œ', '�',
-    'Ž', '�', '�', '‘', '’', '“', '”', '•', '–', '—', '˜', '™', 'š', '›',
-    'œ', '�', 'ž', 'Ÿ',
+    '\u{20AC}', '�', '\u{201A}', '\u{0192}', '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}',
+    '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', '\u{0152}', '�', '\u{017D}', '�', '�',
+    '\u{2018}', '\u{2019}', '\u{201C}', '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}', '\u{02DC}',
+    '\u{2122}', '\u{0161}', '\u{203A}', '\u{0153}', '�', '\u{017E}', '\u{0178}',
 ];
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn encode_01() {
+        let text = "Hello world!";
+        let mut buf = [0u8; 0];
+        let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap();
+        assert_eq!(consumed_count, 0);
+        assert_eq!(encoded, &[]);
+    }
+
+    #[test]
+    fn encode_02() {
+        let text = "Hello world!";
+        let mut buf = [0u8; 1];
+        let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap();
+        assert_eq!(consumed_count, 1);
+        assert_eq!(encoded, "H".as_bytes());
+    }
+
+    #[test]
+    fn encode_03() {
+        let text = "Hello world!";
+        let mut buf = [0u8; 2];
+        let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap();
+        assert_eq!(consumed_count, 2);
+        assert_eq!(encoded, "He".as_bytes());
+    }
+
+    #[test]
+    fn encode_04() {
+        let text = "Hello world!";
+        let mut buf = [0u8; 64];
+        let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap();
+        assert_eq!(consumed_count, 12);
+        assert_eq!(encoded, "Hello world!".as_bytes());
+    }
+
+    #[test]
+    fn encode_05() {
+        let text = "Hello world!こ";
+        let mut buf = [0u8; 12];
+        let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap();
+        assert_eq!(consumed_count, 12);
+        assert_eq!(encoded, "Hello world!".as_bytes());
+    }
+
+    #[test]
+    fn decode_01() {
+        let data = [
+            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21,
+        ]; // "Hello world!"
+        let mut buf = [0u8; 0];
+        let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap();
+        assert_eq!(consumed_count, 0);
+        assert_eq!(decoded, "");
+    }
+
+    #[test]
+    fn decode_02() {
+        let data = [
+            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21,
+        ]; // "Hello world!"
+        let mut buf = [0u8; 1];
+        let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap();
+        assert_eq!(consumed_count, 1);
+        assert_eq!(decoded, "H");
+    }
+
+    #[test]
+    fn decode_03() {
+        let data = [
+            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21,
+        ]; // "Hello world!"
+        let mut buf = [0u8; 2];
+        let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap();
+        assert_eq!(consumed_count, 2);
+        assert_eq!(decoded, "He");
+    }
+
+    #[test]
+    fn decode_04() {
+        let data = [
+            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21,
+        ]; // "Hello world!"
+        let mut buf = [0u8; 64];
+        let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap();
+        assert_eq!(consumed_count, 12);
+        assert_eq!(decoded, "Hello world!");
+    }
+
+    #[test]
+    fn decode_05() {
+        let data = [
+            0x80, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8E, 0x91,
+            0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9E, 0x9F,
+        ]; // "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ", all of the non-latin1 matching characters.
+        let mut buf = [0u8; 128];
+        let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap();
+        assert_eq!(consumed_count, 27);
+        assert_eq!(
+            decoded,
+            "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ"
+        );
+    }
+
+    #[test]
+    fn encode_error_01() {
+        let text = "こello world!";
+        let mut buf = [0u8; 64];
+        assert_eq!(
+            encode_from_str(text, &mut buf),
+            Err(EncodeError {
+                character: 'こ',
+                error_range: (0, 3),
+                output_bytes_written: 0,
+            })
+        );
+    }
+
+    #[test]
+    fn encode_error_02() {
+        let text = "\u{0085}ello world!";
+        let mut buf = [0u8; 64];
+        assert_eq!(
+            encode_from_str(text, &mut buf),
+            Err(EncodeError {
+                character: '\u{0085}',
+                error_range: (0, 2),
+                output_bytes_written: 0,
+            })
+        );
+    }
+
+    #[test]
+    fn encode_error_03() {
+        let text = "Hこllo world!";
+        let mut buf = [0u8; 64];
+        assert_eq!(
+            encode_from_str(text, &mut buf),
+            Err(EncodeError {
+                character: 'こ',
+                error_range: (1, 4),
+                output_bytes_written: 1,
+            })
+        );
+    }
+
+    #[test]
+    fn encode_error_04() {
+        let text = "H\u{0085}llo world!";
+        let mut buf = [0u8; 64];
+        assert_eq!(
+            encode_from_str(text, &mut buf),
+            Err(EncodeError {
+                character: '\u{0085}',
+                error_range: (1, 3),
+                output_bytes_written: 1,
+            })
+        );
+    }
+
+    #[test]
+    fn encode_error_05() {
+        let text = "Heこlo world!";
+        let mut buf = [0u8; 3];
+        assert_eq!(
+            encode_from_str(text, &mut buf),
+            Err(EncodeError {
+                character: 'こ',
+                error_range: (2, 5),
+                output_bytes_written: 2,
+            })
+        );
+    }
+
+    #[test]
+    fn encode_error_06() {
+        let text = "He\u{0085}lo world!";
+        let mut buf = [0u8; 3];
+        assert_eq!(
+            encode_from_str(text, &mut buf),
+            Err(EncodeError {
+                character: '\u{0085}',
+                error_range: (2, 4),
+                output_bytes_written: 2,
+            })
+        );
+    }
+
+    #[test]
+    fn decode_error_01() {
+        let data = [
+            0x48, 0x81, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21,
+        ]; // "Hello world!" with an error on the second byte (undefined byte).
+        let mut buf = [0u8; 64];
+        let error = decode_to_str(&data, &mut buf);
+        assert_eq!(
+            error,
+            Err(DecodeError {
+                error_range: (1, 2),
+                output_bytes_written: 1,
+            })
+        );
+    }
+
+    #[test]
+    fn decode_error_02() {
+        let data = [
+            0x48, 0x8D, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21,
+        ]; // "Hello world!" with an error on the second byte (undefined byte).
+        let mut buf = [0u8; 64];
+        let error = decode_to_str(&data, &mut buf);
+        assert_eq!(
+            error,
+            Err(DecodeError {
+                error_range: (1, 2),
+                output_bytes_written: 1,
+            })
+        );
+    }
+
+    #[test]
+    fn decode_error_03() {
+        let data = [
+            0x48, 0x8F, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21,
+        ]; // "Hello world!" with an error on the second byte (undefined byte).
+        let mut buf = [0u8; 64];
+        let error = decode_to_str(&data, &mut buf);
+        assert_eq!(
+            error,
+            Err(DecodeError {
+                error_range: (1, 2),
+                output_bytes_written: 1,
+            })
+        );
+    }
+
+    #[test]
+    fn decode_error_04() {
+        let data = [
+            0x48, 0x90, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21,
+        ]; // "Hello world!" with an error on the second byte (undefined byte).
+        let mut buf = [0u8; 64];
+        let error = decode_to_str(&data, &mut buf);
+        assert_eq!(
+            error,
+            Err(DecodeError {
+                error_range: (1, 2),
+                output_bytes_written: 1,
+            })
+        );
+    }
+
+    #[test]
+    fn decode_error_05() {
+        let data = [
+            0x48, 0x9D, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21,
+        ]; // "Hello world!" with an error on the second byte (undefined byte).
+        let mut buf = [0u8; 64];
+        let error = decode_to_str(&data, &mut buf);
+        assert_eq!(
+            error,
+            Err(DecodeError {
+                error_range: (1, 2),
+                output_bytes_written: 1,
+            })
+        );
+    }
+}