Added Windows-1252 support to text_encoding sub-crate.

2018-08-20 21:09:01 -07:00 · 2018-08-20 21:09:01 -07:00 · a82b06794e
commit a82b06794e
parent 3ea4e25fbe
2 changed files with 145 additions and 6 deletions
--- a/sub_crates/text_encoding/src/lib.rs
+++ b/sub_crates/text_encoding/src/lib.rs
@ -5,6 +5,7 @@ mod latin1;
 mod utf16_be;
 mod utf16_le;
 mod utf8;
 mod windows1252;
 /// Encodes text from utf8 to a destination encoding.
 pub fn encode_from_utf8<'a>(
@ -17,7 +18,7 @@ pub fn encode_from_utf8<'a>(
        Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
        Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
        Encoding::Latin1 => latin1::encode_from_utf8(input, output),
-        _ => unimplemented!(),
+        Encoding::Windows1252 => windows1252::encode_from_utf8(input, output),
    }
 }
@ -32,7 +33,7 @@ pub fn decode_to_utf8<'a>(
        Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
        Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
        Encoding::Latin1 => latin1::decode_to_utf8(input, output),
-        _ => unimplemented!(),
+        Encoding::Windows1252 => windows1252::decode_to_utf8(input, output),
    }
 }
@ -42,10 +43,11 @@ pub enum Encoding {
    Utf8,
    Utf16BE, // Big endian
    Utf16LE, // Little endian
-    Utf32BE, // Big endian
+    // Utf32BE, // Big endian
-    Utf32LE, // Little endian
+    // Utf32LE, // Little endian
-    ShiftJIS,
+    // ShiftJIS,
-    Big5,
+    // EUC_JP,
    // Big5,
    Latin1,      // ISO/IEC 8859-1
    Windows1252, // Windows code page 1252
 }
--- a/sub_crates/text_encoding/src/windows1252.rs
+++ b/sub_crates/text_encoding/src/windows1252.rs
@ -0,0 +1,137 @@
 //! Encoding/decoding functions for Windows-1252.
 use std;
 use {DecodeError, DecodeResult, EncodeError, EncodeResult};
 /// Encodes utf8 text into Windows-1252, writing the result into `output`.
 ///
 /// Characters are encoded one at a time, each producing exactly one output
 /// byte, until either `input` is exhausted or `output` is full.
 ///
 /// On success, returns the number of bytes of `input` that were consumed
 /// along with the slice of `output` that was written.  The consumed count
 /// always lands on a char boundary of `input`, and is zero when nothing
 /// could be written (e.g. `output` is empty).
 ///
 /// # Errors
 ///
 /// Returns an `EncodeError` when a character that has no Windows-1252
 /// representation is reached.  The error carries the offending character,
 /// its byte range within `input`, and the number of output bytes written
 /// before it.
 pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    // Byte offset in `input` just past the last character that was encoded.
    let mut input_i = 0;
    let mut output_i = 0;
    for (offset, c) in input.char_indices() {
        if output_i >= output.len() {
            // Out of room: stop here and report what was consumed so far.
            break;
        }
        match encode_table(c) {
            Some(byte) => {
                output[output_i] = byte;
                output_i += 1;
                // Only advance past characters that were actually written,
                // so that an untouched input reports zero bytes consumed.
                input_i = offset + c.len_utf8();
            }
            None => {
                return Err(EncodeError {
                    character: c,
                    error_range: (offset, offset + c.len_utf8()),
                    output_bytes_written: output_i,
                });
            }
        }
    }
    Ok((input_i, &output[..output_i]))
 }
 /// Decodes Windows-1252 bytes into utf8, writing the result into `output`.
 ///
 /// Bytes are decoded one at a time until either `input` is exhausted or the
 /// utf8 encoding of the next character no longer fits in `output`.
 ///
 /// On success, returns the number of bytes of `input` that were consumed
 /// along with the decoded text as a `&str` view into `output`.
 ///
 /// # Errors
 ///
 /// Returns a `DecodeError` when a byte in the range 0x80-0x9F that
 /// `DECODE_TABLE` marks as undefined is reached.  The error carries the
 /// byte's range within `input` and the number of output bytes written
 /// before it.
 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;
    for &byte in input.iter() {
        if byte < 0x80 {
            // ASCII: identical single byte in utf8.
            if output_i >= output.len() {
                break;
            }
            output[output_i] = byte;
            input_i += 1;
            output_i += 1;
        } else if byte < 0xA0 {
            // 0x80-0x9F differs from unicode, so go through the lookup table.
            let code = DECODE_TABLE[byte as usize - 0x80];
            if code == '\u{FFFD}' {
                // Error: undefined byte.
                return Err(DecodeError {
                    error_range: (input_i, input_i + 1),
                    output_bytes_written: output_i,
                });
            }
            // Encode to utf8 in a scratch buffer first, so that nothing is
            // written to `output` unless the whole character fits.
            let mut buf = [0u8; 4];
            let s = code.encode_utf8(&mut buf);
            if (output_i + s.len()) > output.len() {
                break;
            }
            output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes());
            input_i += 1;
            output_i += s.len();
        } else {
            // 0xA0-0xFF maps directly to the same unicode code point, which
            // always takes exactly two bytes in utf8.
            if (output_i + 1) >= output.len() {
                break;
            }
            output[output_i] = 0b11000000 | (byte >> 6);
            output[output_i + 1] = 0b10000000 | (byte & 0b00111111);
            input_i += 1;
            output_i += 2;
        }
    }
    // SAFETY: every byte in `output[..output_i]` was written above as either
    // an ASCII byte, the output of `char::encode_utf8`, or a complete
    // two-byte utf8 sequence for a code point in 0xA0-0xFF, so the slice is
    // valid utf8.
    Ok((input_i, unsafe {
        std::str::from_utf8_unchecked(&output[..output_i])
    }))
 }
 // Looks up the windows-1252 byte for a unicode character.
 //
 // Yields `None` when the character has no windows-1252 representation.
 #[inline(always)]
 fn encode_table(code: char) -> Option<u8> {
    let byte = match code {
        // ASCII and the upper half of Latin-1 have the same value in both
        // unicode and windows-1252.
        '\u{0000}'..='\u{007F}' | '\u{00A0}'..='\u{00FF}' => code as u8,

        // Characters that windows-1252 places in the 0x80-0x9F range.
        '\u{20AC}' => 0x80, // euro sign
        '\u{201A}' => 0x82, // single low-9 quotation mark
        '\u{0192}' => 0x83, // latin small letter f with hook
        '\u{201E}' => 0x84, // double low-9 quotation mark
        '\u{2026}' => 0x85, // horizontal ellipsis
        '\u{2020}' => 0x86, // dagger
        '\u{2021}' => 0x87, // double dagger
        '\u{02C6}' => 0x88, // modifier letter circumflex accent
        '\u{2030}' => 0x89, // per mille sign
        '\u{0160}' => 0x8A, // latin capital letter s with caron
        '\u{2039}' => 0x8B, // single left-pointing angle quotation mark
        '\u{0152}' => 0x8C, // latin capital ligature oe
        '\u{017D}' => 0x8E, // latin capital letter z with caron
        '\u{2018}' => 0x91, // left single quotation mark
        '\u{2019}' => 0x92, // right single quotation mark
        '\u{201C}' => 0x93, // left double quotation mark
        '\u{201D}' => 0x94, // right double quotation mark
        '\u{2022}' => 0x95, // bullet
        '\u{2013}' => 0x96, // en dash
        '\u{2014}' => 0x97, // em dash
        '\u{02DC}' => 0x98, // small tilde
        '\u{2122}' => 0x99, // trade mark sign
        '\u{0161}' => 0x9A, // latin small letter s with caron
        '\u{203A}' => 0x9B, // single right-pointing angle quotation mark
        '\u{0153}' => 0x9C, // latin small ligature oe
        '\u{017E}' => 0x9E, // latin small letter z with caron
        '\u{0178}' => 0x9F, // latin capital letter y with diaeresis

        // Everything else, including the C1 controls U+0080-U+009F.
        _ => return None,
    };
    Some(byte)
 }
 // Maps the range 0x80-0x9F in windows-1252 to unicode.  The remaining
 // characters in windows-1252 match unicode.
 //
 // The table is indexed by `byte - 0x80`.
 //
 // U+FFFD (the unicode replacement character) stands in for codes not
 // defined in windows-1252, and should be treated as an error when
 // encountered.
 const DECODE_TABLE: [char; 32] = [
    '\u{20AC}', // 0x80: euro sign
    '\u{FFFD}', // 0x81: undefined
    '\u{201A}', // 0x82: single low-9 quotation mark
    '\u{0192}', // 0x83: latin small letter f with hook
    '\u{201E}', // 0x84: double low-9 quotation mark
    '\u{2026}', // 0x85: horizontal ellipsis
    '\u{2020}', // 0x86: dagger
    '\u{2021}', // 0x87: double dagger
    '\u{02C6}', // 0x88: modifier letter circumflex accent
    '\u{2030}', // 0x89: per mille sign
    '\u{0160}', // 0x8A: latin capital letter s with caron
    '\u{2039}', // 0x8B: single left-pointing angle quotation mark
    '\u{0152}', // 0x8C: latin capital ligature oe
    '\u{FFFD}', // 0x8D: undefined
    '\u{017D}', // 0x8E: latin capital letter z with caron
    '\u{FFFD}', // 0x8F: undefined
    '\u{FFFD}', // 0x90: undefined
    '\u{2018}', // 0x91: left single quotation mark
    '\u{2019}', // 0x92: right single quotation mark
    '\u{201C}', // 0x93: left double quotation mark
    '\u{201D}', // 0x94: right double quotation mark
    '\u{2022}', // 0x95: bullet
    '\u{2013}', // 0x96: en dash
    '\u{2014}', // 0x97: em dash
    '\u{02DC}', // 0x98: small tilde
    '\u{2122}', // 0x99: trade mark sign
    '\u{0161}', // 0x9A: latin small letter s with caron
    '\u{203A}', // 0x9B: single right-pointing angle quotation mark
    '\u{0153}', // 0x9C: latin small ligature oe
    '\u{FFFD}', // 0x9D: undefined
    '\u{017E}', // 0x9E: latin small letter z with caron
    '\u{0178}', // 0x9F: latin capital letter y with diaeresis
 ];