Added Windows-1252 support to text_encoding sub-crate.

2018-08-20 21:09:01 -07:00 · 2018-08-20 21:09:01 -07:00 · a82b06794e
commit a82b06794e
parent 3ea4e25fbe
2 changed files with 145 additions and 6 deletions
--- a/sub_crates/text_encoding/src/lib.rs
+++ b/sub_crates/text_encoding/src/lib.rs
@ -5,6 +5,7 @@ mod latin1;
 mod utf16_be;
 mod utf16_le;
 mod utf8;
+mod windows1252;

 /// Encodes text from utf8 to a destination encoding.
 pub fn encode_from_utf8<'a>(
@ -17,7 +18,7 @@ pub fn encode_from_utf8<'a>(
        Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
        Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
        Encoding::Latin1 => latin1::encode_from_utf8(input, output),
-        _ => unimplemented!(),
+        Encoding::Windows1252 => windows1252::encode_from_utf8(input, output),
    }
 }

@ -32,7 +33,7 @@ pub fn decode_to_utf8<'a>(
        Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
        Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
        Encoding::Latin1 => latin1::decode_to_utf8(input, output),
-        _ => unimplemented!(),
+        Encoding::Windows1252 => windows1252::decode_to_utf8(input, output),
    }
 }

@ -42,10 +43,11 @@ pub enum Encoding {
    Utf8,
    Utf16BE, // Big endian
    Utf16LE, // Little endian
-    Utf32BE, // Big endian
-    Utf32LE, // Little endian
-    ShiftJIS,
-    Big5,
+    // Utf32BE, // Big endian
+    // Utf32LE, // Little endian
+    // ShiftJIS,
+    // EUC_JP,
+    // Big5,
    Latin1,      // ISO/IEC 8859-1
    Windows1252, // Windows code page 1252
 }
--- a/sub_crates/text_encoding/src/windows1252.rs
+++ b/sub_crates/text_encoding/src/windows1252.rs
@ -0,0 +1,137 @@
+//! Encoding/decoding functions for Windows-1252.
+
+use std;
+use {DecodeError, DecodeResult, EncodeError, EncodeResult};
+
+/// Encodes `input` into Windows-1252, writing one byte per character into
+/// `output`.
+///
+/// Encoding stops early, without error, as soon as `output` is full.
+///
+/// Returns `Ok((consumed, encoded))`, where `consumed` is the number of bytes
+/// of `input` that were actually encoded (always a char boundary of `input`)
+/// and `encoded` is the filled prefix of `output`.  When nothing could be
+/// written, e.g. because `output` is empty, `consumed` is zero.
+///
+/// # Errors
+///
+/// Returns an `EncodeError` when a character that `encode_table` cannot map
+/// is reached before `output` fills up.  The error carries the offending
+/// character, its byte range within `input`, and the number of bytes written
+/// to `output` before it.
+pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
+    // Byte offset in `input` just past the last character that was encoded.
+    let mut input_i = 0;
+    let mut output_i = 0;
+    for (offset, c) in input.char_indices() {
+        if output_i >= output.len() {
+            break;
+        }
+        match encode_table(c) {
+            Some(byte) => {
+                output[output_i] = byte;
+                output_i += 1;
+                // Only advance past characters that were really written, so
+                // an empty or full output buffer never over-reports
+                // consumption.
+                input_i = offset + c.len_utf8();
+            }
+            None => {
+                return Err(EncodeError {
+                    character: c,
+                    error_range: (offset, offset + c.len_utf8()),
+                    output_bytes_written: output_i,
+                });
+            }
+        }
+    }
+
+    Ok((input_i, &output[..output_i]))
+}
+
+/// Decodes Windows-1252 bytes from `input` into UTF-8 written to `output`.
+///
+/// Decoding stops early, without error, at the first character whose UTF-8
+/// form no longer fits in the remaining space of `output`.
+///
+/// Returns `Ok((consumed, text))`, where `consumed` is the number of bytes of
+/// `input` that were decoded and `text` is the filled prefix of `output`
+/// viewed as a `&str`.
+///
+/// # Errors
+///
+/// Returns a `DecodeError` when a byte in 0x80-0x9F that `DECODE_TABLE` marks
+/// as undefined (U+FFFD) is reached.  The error carries the byte range of the
+/// offending input byte and the number of bytes written to `output` before it.
+pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
+    let mut input_i = 0;
+    let mut output_i = 0;
+    for &byte in input {
+        // Map the byte to its unicode scalar value.
+        let code = if byte < 0x80 || byte >= 0xA0 {
+            // Outside 0x80-0x9F the byte value is the code point itself.
+            char::from(byte)
+        } else {
+            // Use lookup table.
+            let code = DECODE_TABLE[usize::from(byte) - 0x80];
+            if code == '\u{FFFD}' {
+                // Error: undefined byte.
+                return Err(DecodeError {
+                    error_range: (input_i, input_i + 1),
+                    output_bytes_written: output_i,
+                });
+            }
+            code
+        };
+
+        // Encode to utf8, stopping if the whole character doesn't fit.
+        let mut buf = [0u8; 4];
+        let s = code.encode_utf8(&mut buf);
+        let end = output_i + s.len();
+        if end > output.len() {
+            break;
+        }
+        output[output_i..end].copy_from_slice(s.as_bytes());
+        input_i += 1;
+        output_i = end;
+    }
+
+    // SAFETY: `output[..output_i]` consists solely of complete sequences
+    // produced by `char::encode_utf8`, so it is valid UTF-8.
+    Ok((input_i, unsafe {
+        std::str::from_utf8_unchecked(&output[..output_i])
+    }))
+}
+
+// Looks up the windows-1252 byte for a unicode character.
+//
+// Code points below 0x80 and in 0xA0-0xFF map to the byte of the same value;
+// the characters that windows-1252 places in 0x80-0x9F are matched
+// individually.  Every other character yields `None`.
+#[inline(always)]
+fn encode_table(code: char) -> Option<u8> {
+    let byte = match code {
+        // Shared with unicode: the byte is the code point.
+        '\u{00}'..='\u{7F}' | '\u{A0}'..='\u{FF}' => code as u8,
+
+        // Windows-1252 specific assignments in 0x80-0x9F.
+        '€' => 0x80,
+        '‚' => 0x82,
+        'ƒ' => 0x83,
+        '„' => 0x84,
+        '…' => 0x85,
+        '†' => 0x86,
+        '‡' => 0x87,
+        'ˆ' => 0x88,
+        '‰' => 0x89,
+        'Š' => 0x8A,
+        '‹' => 0x8B,
+        'Œ' => 0x8C,
+        'Ž' => 0x8E,
+        '‘' => 0x91,
+        '’' => 0x92,
+        '“' => 0x93,
+        '”' => 0x94,
+        '•' => 0x95,
+        '–' => 0x96,
+        '—' => 0x97,
+        '˜' => 0x98,
+        '™' => 0x99,
+        'š' => 0x9A,
+        '›' => 0x9B,
+        'œ' => 0x9C,
+        'ž' => 0x9E,
+        'Ÿ' => 0x9F,
+
+        // Not representable in windows-1252.
+        _ => return None,
+    };
+    Some(byte)
+}
+
+// Maps the range 0x80-0x9F in windows-1252 to unicode.  The remaining
+// characters in windows-1252 match unicode.
+//
+// The U+FFFD replacement characters ('\u{FFFD}') stand in for codes not
+// defined in windows-1252 (0x81, 0x8D, 0x8F, 0x90 and 0x9D), and should be
+// treated as an error when encountered.
+const DECODE_TABLE: [char; 32] = [
+    // 0x80-0x87
+    '€', '\u{FFFD}', '‚', 'ƒ', '„', '…', '†', '‡',
+    // 0x88-0x8F
+    'ˆ', '‰', 'Š', '‹', 'Œ', '\u{FFFD}', 'Ž', '\u{FFFD}',
+    // 0x90-0x97
+    '\u{FFFD}', '‘', '’', '“', '”', '•', '–', '—',
+    // 0x98-0x9F
+    '˜', '™', 'š', '›', 'œ', '\u{FFFD}', 'ž', 'Ÿ',
+];