diff --git a/sub_crates/text_encoding/src/lib.rs b/sub_crates/text_encoding/src/lib.rs index d4d0628..60cb7a3 100644 --- a/sub_crates/text_encoding/src/lib.rs +++ b/sub_crates/text_encoding/src/lib.rs @@ -5,6 +5,7 @@ mod latin1; mod utf16_be; mod utf16_le; mod utf8; +mod windows1252; /// Encodes text from utf8 to a destination encoding. pub fn encode_from_utf8<'a>( @@ -17,7 +18,7 @@ pub fn encode_from_utf8<'a>( Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output), Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output), Encoding::Latin1 => latin1::encode_from_utf8(input, output), - _ => unimplemented!(), + Encoding::Windows1252 => windows1252::encode_from_utf8(input, output), } } @@ -32,7 +33,7 @@ pub fn decode_to_utf8<'a>( Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output), Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output), Encoding::Latin1 => latin1::decode_to_utf8(input, output), - _ => unimplemented!(), + Encoding::Windows1252 => windows1252::decode_to_utf8(input, output), } } @@ -42,10 +43,11 @@ pub enum Encoding { Utf8, Utf16BE, // Big endian Utf16LE, // Little endian - Utf32BE, // Big endian - Utf32LE, // Little endian - ShiftJIS, - Big5, + // Utf32BE, // Big endian + // Utf32LE, // Little endian + // ShiftJIS, + // EUC_JP, + // Big5, Latin1, // ISO/IEC 8859-1 Windows1252, // Windows code page 1252 } diff --git a/sub_crates/text_encoding/src/windows1252.rs b/sub_crates/text_encoding/src/windows1252.rs new file mode 100644 index 0000000..3e407e7 --- /dev/null +++ b/sub_crates/text_encoding/src/windows1252.rs @@ -0,0 +1,137 @@ +//! Encoding/decoding functions for Windows-1252. + +use std; +use {DecodeError, DecodeResult, EncodeError, EncodeResult}; + +pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { + // Do the encode. 
+ let mut input_i = 0; + let mut output_i = 0; + for (offset, c) in input.char_indices() { + if output_i >= output.len() { + break; + } + if let Some(byte) = encode_table(c) { + output[output_i] = byte; + output_i += 1; + input_i = offset; + } else { + return Err(EncodeError { + character: c, + error_range: (offset, offset + c.len_utf8()), + output_bytes_written: output_i, + }); + } + } + + // Calculate how much of the input was consumed. + input_i += 1; + if input_i > input.len() { + input_i = input.len(); + } else { + while !input.is_char_boundary(input_i) { + input_i += 1; + } + } + + Ok((input_i, &output[..output_i])) +} + +pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { + let mut input_i = 0; + let mut output_i = 0; + for &byte in input.iter() { + if byte < 0x80 { + // 1-byte case + if output_i >= output.len() { + break; + } + output[output_i] = byte; + input_i += 1; + output_i += 1; + } else if byte < 0xA0 { + // Use lookup table. + let code = DECODE_TABLE[byte as usize - 0x80]; + if code == '�' { + // Error: undefined byte. + return Err(DecodeError { + error_range: (input_i, input_i + 1), + output_bytes_written: output_i, + }); + } + // Encode to utf8 + let mut buf = [0u8; 4]; + let s = code.encode_utf8(&mut buf); + if (output_i + s.len()) > output.len() { + break; + } + output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes()); + input_i += 1; + output_i += s.len(); + } else { + // Non-lookup-table 2-byte case + if (output_i + 1) >= output.len() { + break; + } + output[output_i] = 0b11000000 | (byte >> 6); + output[output_i + 1] = 0b10000000 | (byte & 0b00111111); + input_i += 1; + output_i += 2; + } + } + + Ok((input_i, unsafe { + std::str::from_utf8_unchecked(&output[..output_i]) + })) +} + +// Maps unicode to windows-1252. +// +// Returns `None` for characters not in windows-1252. 
/// Looks up the single windows-1252 byte for `code`.
///
/// Code points below U+0080 and in U+00A0..=U+00FF map to the byte of the
/// same value; the characters listed in the `match` map into 0x80..=0x9F.
/// Everything else yields `None`.
#[inline(always)]
fn encode_table(code: char) -> Option<u8> {
    let cp = code as u32;
    if cp < 0x80 || (cp > 0x9F && cp <= 0xFF) {
        // The range check above guarantees the value fits in a byte.
        return Some(cp as u8);
    }
    match code {
        '€' => Some(0x80),
        '‚' => Some(0x82),
        'ƒ' => Some(0x83),
        '„' => Some(0x84),
        '…' => Some(0x85),
        '†' => Some(0x86),
        '‡' => Some(0x87),
        'ˆ' => Some(0x88),
        '‰' => Some(0x89),
        'Š' => Some(0x8A),
        '‹' => Some(0x8B),
        'Œ' => Some(0x8C),
        'Ž' => Some(0x8E),
        '‘' => Some(0x91),
        '’' => Some(0x92),
        '“' => Some(0x93),
        '”' => Some(0x94),
        '•' => Some(0x95),
        '–' => Some(0x96),
        '—' => Some(0x97),
        '˜' => Some(0x98),
        '™' => Some(0x99),
        'š' => Some(0x9A),
        '›' => Some(0x9B),
        'œ' => Some(0x9C),
        'ž' => Some(0x9E),
        'Ÿ' => Some(0x9F),
        _ => None,
    }
}

// Maps the range 0x80-0x9F in windows-1252 to unicode. The remaining
// characters in windows-1252 match unicode.
//
// The U+FFFD entries stand in for codes not defined in windows-1252, and
// should be treated as an error when encountered.
const DECODE_TABLE: [char; 32] = [
    // 0x80..=0x87
    '€', '\u{FFFD}', '‚', 'ƒ', '„', '…', '†', '‡',
    // 0x88..=0x8F
    'ˆ', '‰', 'Š', '‹', 'Œ', '\u{FFFD}', 'Ž', '\u{FFFD}',
    // 0x90..=0x97
    '\u{FFFD}', '‘', '’', '“', '”', '•', '–', '—',
    // 0x98..=0x9F
    '˜', '™', 'š', '›', 'œ', '\u{FFFD}', 'ž', 'Ÿ',
];