Added Windows-1252 support to text_encoding sub-crate.

This commit is contained in:
Nathan Vegdahl 2018-08-20 21:09:01 -07:00
parent 3ea4e25fbe
commit a82b06794e
2 changed files with 145 additions and 6 deletions

View File

@ -5,6 +5,7 @@ mod latin1;
mod utf16_be;
mod utf16_le;
mod utf8;
mod windows1252;
/// Encodes text from utf8 to a destination encoding.
pub fn encode_from_utf8<'a>(
@ -17,7 +18,7 @@ pub fn encode_from_utf8<'a>(
Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
Encoding::Latin1 => latin1::encode_from_utf8(input, output),
_ => unimplemented!(),
Encoding::Windows1252 => windows1252::encode_from_utf8(input, output),
}
}
@ -32,7 +33,7 @@ pub fn decode_to_utf8<'a>(
Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
Encoding::Latin1 => latin1::decode_to_utf8(input, output),
_ => unimplemented!(),
Encoding::Windows1252 => windows1252::decode_to_utf8(input, output),
}
}
@ -42,10 +43,11 @@ pub enum Encoding {
Utf8,
Utf16BE, // Big endian
Utf16LE, // Little endian
Utf32BE, // Big endian
Utf32LE, // Little endian
ShiftJIS,
Big5,
// Utf32BE, // Big endian
// Utf32LE, // Little endian
// ShiftJIS,
// EUC_JP,
// Big5,
Latin1, // ISO/IEC 8859-1
Windows1252, // Windows code page 1252
}

View File

@ -0,0 +1,137 @@
//! Encoding/decoding functions for Windows-1252.
use std;
use {DecodeError, DecodeResult, EncodeError, EncodeResult};
/// Encodes UTF-8 text as Windows-1252, writing the result into `output`.
///
/// Encoding proceeds one character at a time and stops when either
/// `input` is exhausted or `output` has no room left, whichever happens
/// first.
///
/// On success, returns a tuple of:
///
/// * the number of bytes of `input` that were consumed, which always
///   lands on a char boundary and only counts characters whose encoded
///   byte was actually written, and
/// * the slice of `output` that was written to.
///
/// # Errors
///
/// Returns an `EncodeError` when a character that `encode_table` has no
/// byte for is reached.  The error carries the offending character, its
/// byte range within `input`, and the number of output bytes written
/// before it.
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    let mut input_i = 0;
    let mut output_i = 0;

    for (offset, c) in input.char_indices() {
        // Out of room: leave this character (and everything after it)
        // unconsumed so the caller can retry with more output space.
        if output_i >= output.len() {
            break;
        }

        match encode_table(c) {
            Some(byte) => {
                output[output_i] = byte;
                output_i += 1;
                // Only now has the character been consumed.  Tracking the
                // end offset here (rather than deriving it after the loop)
                // keeps the count at zero when nothing could be written.
                input_i = offset + c.len_utf8();
            }
            None => {
                return Err(EncodeError {
                    character: c,
                    error_range: (offset, offset + c.len_utf8()),
                    output_bytes_written: output_i,
                });
            }
        }
    }

    Ok((input_i, &output[..output_i]))
}
/// Decodes Windows-1252 bytes into UTF-8, writing the result into `output`.
///
/// Decoding proceeds one input byte at a time and stops when either
/// `input` is exhausted or the next character's UTF-8 form would not fit
/// in the remaining space of `output`.  A character is never written
/// partially.
///
/// On success, returns a tuple of:
///
/// * the number of bytes of `input` that were consumed, and
/// * the decoded text, as a `&str` borrowed from `output`.
///
/// # Errors
///
/// Returns a `DecodeError` when a byte in the range 0x80-0x9F that
/// `DECODE_TABLE` marks as undefined (U+FFFD) is reached.  The error
/// carries the one-byte range of the offending byte within `input` and
/// the number of output bytes written before it.
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
    // Sentinel used by `DECODE_TABLE` for bytes with no assigned character.
    const UNDEFINED: char = '\u{FFFD}';

    let mut input_i = 0;
    let mut output_i = 0;

    for &byte in input {
        let code = match byte {
            // Bytes in this range differ from unicode: use the lookup table.
            0x80..=0x9F => {
                let code = DECODE_TABLE[usize::from(byte - 0x80)];
                if code == UNDEFINED {
                    // Error: undefined byte.
                    return Err(DecodeError {
                        error_range: (input_i, input_i + 1),
                        output_bytes_written: output_i,
                    });
                }
                code
            }
            // Every other byte has the same value as its unicode code point.
            _ => char::from(byte),
        };

        // Encode to utf8, stopping if the whole character does not fit.
        let mut buf = [0u8; 4];
        let encoded = code.encode_utf8(&mut buf).as_bytes();
        let end = output_i + encoded.len();
        if end > output.len() {
            break;
        }
        output[output_i..end].copy_from_slice(encoded);

        input_i += 1;
        output_i = end;
    }

    // SAFETY: every byte in `output[..output_i]` was copied from the result
    // of `char::encode_utf8`, and only whole characters are ever copied, so
    // the prefix is valid UTF-8.
    Ok((input_i, unsafe {
        std::str::from_utf8_unchecked(&output[..output_i])
    }))
}
// Maps unicode to windows-1252.
//
// Code points U+0000-U+007F and U+00A0-U+00FF have the same value in
// windows-1252 and are passed through unchanged.  The characters that
// windows-1252 places in 0x80-0x9F are looked up explicitly.
//
// Returns `None` for characters not in windows-1252.  Note that this
// includes the unicode code points U+0080-U+009F themselves.
//
// The characters are written as `\u{...}` escapes so the table does not
// depend on how this file's bytes are interpreted by editors or tools.
#[inline(always)]
fn encode_table(code: char) -> Option<u8> {
    let byte = match code {
        // Identical in unicode and windows-1252; the cast cannot truncate
        // because the pattern limits `code` to at most U+00FF.
        '\u{0000}'..='\u{007F}' | '\u{00A0}'..='\u{00FF}' => code as u8,

        '\u{20AC}' => 0x80, // euro sign
        '\u{201A}' => 0x82, // single low-9 quotation mark
        '\u{0192}' => 0x83, // latin small letter f with hook
        '\u{201E}' => 0x84, // double low-9 quotation mark
        '\u{2026}' => 0x85, // horizontal ellipsis
        '\u{2020}' => 0x86, // dagger
        '\u{2021}' => 0x87, // double dagger
        '\u{02C6}' => 0x88, // modifier letter circumflex accent
        '\u{2030}' => 0x89, // per mille sign
        '\u{0160}' => 0x8A, // latin capital letter s with caron
        '\u{2039}' => 0x8B, // single left-pointing angle quotation mark
        '\u{0152}' => 0x8C, // latin capital ligature oe
        '\u{017D}' => 0x8E, // latin capital letter z with caron
        '\u{2018}' => 0x91, // left single quotation mark
        '\u{2019}' => 0x92, // right single quotation mark
        '\u{201C}' => 0x93, // left double quotation mark
        '\u{201D}' => 0x94, // right double quotation mark
        '\u{2022}' => 0x95, // bullet
        '\u{2013}' => 0x96, // en dash
        '\u{2014}' => 0x97, // em dash
        '\u{02DC}' => 0x98, // small tilde
        '\u{2122}' => 0x99, // trade mark sign
        '\u{0161}' => 0x9A, // latin small letter s with caron
        '\u{203A}' => 0x9B, // single right-pointing angle quotation mark
        '\u{0153}' => 0x9C, // latin small ligature oe
        '\u{017E}' => 0x9E, // latin small letter z with caron
        '\u{0178}' => 0x9F, // latin capital letter y with diaeresis

        _ => return None,
    };
    Some(byte)
}
// Maps the range 0x80-0x9F in windows-1252 to unicode.  The remaining
// characters in windows-1252 match unicode.
//
// Index into the table with `byte - 0x80`.
//
// The U+FFFD (replacement character) entries stand in for codes not
// defined in windows-1252, and should be treated as an error when
// encountered.
//
// The characters are written as `\u{...}` escapes so the table does not
// depend on how this file's bytes are interpreted by editors or tools.
const DECODE_TABLE: [char; 32] = [
    '\u{20AC}', // 0x80 euro sign
    '\u{FFFD}', // 0x81 (undefined)
    '\u{201A}', // 0x82 single low-9 quotation mark
    '\u{0192}', // 0x83 latin small letter f with hook
    '\u{201E}', // 0x84 double low-9 quotation mark
    '\u{2026}', // 0x85 horizontal ellipsis
    '\u{2020}', // 0x86 dagger
    '\u{2021}', // 0x87 double dagger
    '\u{02C6}', // 0x88 modifier letter circumflex accent
    '\u{2030}', // 0x89 per mille sign
    '\u{0160}', // 0x8A latin capital letter s with caron
    '\u{2039}', // 0x8B single left-pointing angle quotation mark
    '\u{0152}', // 0x8C latin capital ligature oe
    '\u{FFFD}', // 0x8D (undefined)
    '\u{017D}', // 0x8E latin capital letter z with caron
    '\u{FFFD}', // 0x8F (undefined)
    '\u{FFFD}', // 0x90 (undefined)
    '\u{2018}', // 0x91 left single quotation mark
    '\u{2019}', // 0x92 right single quotation mark
    '\u{201C}', // 0x93 left double quotation mark
    '\u{201D}', // 0x94 right double quotation mark
    '\u{2022}', // 0x95 bullet
    '\u{2013}', // 0x96 en dash
    '\u{2014}', // 0x97 em dash
    '\u{02DC}', // 0x98 small tilde
    '\u{2122}', // 0x99 trade mark sign
    '\u{0161}', // 0x9A latin small letter s with caron
    '\u{203A}', // 0x9B single right-pointing angle quotation mark
    '\u{0153}', // 0x9C latin small ligature oe
    '\u{FFFD}', // 0x9D (undefined)
    '\u{017E}', // 0x9E latin small letter z with caron
    '\u{0178}', // 0x9F latin capital letter y with diaeresis
];