Added Windows-1252 support to text_encoding sub-crate.
This commit is contained in:
parent
3ea4e25fbe
commit
a82b06794e
|
@ -5,6 +5,7 @@ mod latin1;
|
||||||
mod utf16_be;
|
mod utf16_be;
|
||||||
mod utf16_le;
|
mod utf16_le;
|
||||||
mod utf8;
|
mod utf8;
|
||||||
|
mod windows1252;
|
||||||
|
|
||||||
/// Encodes text from utf8 to a destination encoding.
|
/// Encodes text from utf8 to a destination encoding.
|
||||||
pub fn encode_from_utf8<'a>(
|
pub fn encode_from_utf8<'a>(
|
||||||
|
@ -17,7 +18,7 @@ pub fn encode_from_utf8<'a>(
|
||||||
Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
|
Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
|
||||||
Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
|
Encoding::Utf16LE => utf16_le::encode_from_utf8(input, output),
|
||||||
Encoding::Latin1 => latin1::encode_from_utf8(input, output),
|
Encoding::Latin1 => latin1::encode_from_utf8(input, output),
|
||||||
_ => unimplemented!(),
|
Encoding::Windows1252 => windows1252::encode_from_utf8(input, output),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,7 +33,7 @@ pub fn decode_to_utf8<'a>(
|
||||||
Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
|
Encoding::Utf16BE => utf16_be::decode_to_utf8(input, output),
|
||||||
Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
|
Encoding::Utf16LE => utf16_le::decode_to_utf8(input, output),
|
||||||
Encoding::Latin1 => latin1::decode_to_utf8(input, output),
|
Encoding::Latin1 => latin1::decode_to_utf8(input, output),
|
||||||
_ => unimplemented!(),
|
Encoding::Windows1252 => windows1252::decode_to_utf8(input, output),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,10 +43,11 @@ pub enum Encoding {
|
||||||
Utf8,
|
Utf8,
|
||||||
Utf16BE, // Big endian
|
Utf16BE, // Big endian
|
||||||
Utf16LE, // Little endian
|
Utf16LE, // Little endian
|
||||||
Utf32BE, // Big endian
|
// Utf32BE, // Big endian
|
||||||
Utf32LE, // Little endian
|
// Utf32LE, // Little endian
|
||||||
ShiftJIS,
|
// ShiftJIS,
|
||||||
Big5,
|
// EUC_JP,
|
||||||
|
// Big5,
|
||||||
Latin1, // ISO/IEC 8859-1
|
Latin1, // ISO/IEC 8859-1
|
||||||
Windows1252, // Windows code page 1252
|
Windows1252, // Windows code page 1252
|
||||||
}
|
}
|
||||||
|
|
137
sub_crates/text_encoding/src/windows1252.rs
Normal file
137
sub_crates/text_encoding/src/windows1252.rs
Normal file
|
@ -0,0 +1,137 @@
|
||||||
|
//! Encoding/decoding functions for Windows-1252.
|
||||||
|
|
||||||
|
use std;
|
||||||
|
use {DecodeError, DecodeResult, EncodeError, EncodeResult};
|
||||||
|
|
||||||
|
pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
|
||||||
|
// Do the encode.
|
||||||
|
let mut input_i = 0;
|
||||||
|
let mut output_i = 0;
|
||||||
|
for (offset, c) in input.char_indices() {
|
||||||
|
if output_i >= output.len() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Some(byte) = encode_table(c) {
|
||||||
|
output[output_i] = byte;
|
||||||
|
output_i += 1;
|
||||||
|
input_i = offset;
|
||||||
|
} else {
|
||||||
|
return Err(EncodeError {
|
||||||
|
character: c,
|
||||||
|
error_range: (offset, offset + c.len_utf8()),
|
||||||
|
output_bytes_written: output_i,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate how much of the input was consumed.
|
||||||
|
input_i += 1;
|
||||||
|
if input_i > input.len() {
|
||||||
|
input_i = input.len();
|
||||||
|
} else {
|
||||||
|
while !input.is_char_boundary(input_i) {
|
||||||
|
input_i += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok((input_i, &output[..output_i]))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
|
||||||
|
let mut input_i = 0;
|
||||||
|
let mut output_i = 0;
|
||||||
|
for &byte in input.iter() {
|
||||||
|
if byte < 0x80 {
|
||||||
|
// 1-byte case
|
||||||
|
if output_i >= output.len() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
output[output_i] = byte;
|
||||||
|
input_i += 1;
|
||||||
|
output_i += 1;
|
||||||
|
} else if byte < 0xA0 {
|
||||||
|
// Use lookup table.
|
||||||
|
let code = DECODE_TABLE[byte as usize - 0x80];
|
||||||
|
if code == '<27>' {
|
||||||
|
// Error: undefined byte.
|
||||||
|
return Err(DecodeError {
|
||||||
|
error_range: (input_i, input_i + 1),
|
||||||
|
output_bytes_written: output_i,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
// Encode to utf8
|
||||||
|
let mut buf = [0u8; 4];
|
||||||
|
let s = code.encode_utf8(&mut buf);
|
||||||
|
if (output_i + s.len()) > output.len() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes());
|
||||||
|
input_i += 1;
|
||||||
|
output_i += s.len();
|
||||||
|
} else {
|
||||||
|
// Non-lookup-table 2-byte case
|
||||||
|
if (output_i + 1) >= output.len() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
output[output_i] = 0b11000000 | (byte >> 6);
|
||||||
|
output[output_i + 1] = 0b10000000 | (byte & 0b00111111);
|
||||||
|
input_i += 1;
|
||||||
|
output_i += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok((input_i, unsafe {
|
||||||
|
std::str::from_utf8_unchecked(&output[..output_i])
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Looks up the windows-1252 byte for a unicode character.
//
// Code points below 0x80 and in 0xA0-0xFF map to the byte of the same
// value; the remaining windows-1252 characters (which live in bytes
// 0x80-0x9F) are matched individually.
//
// Yields `None` when the character has no windows-1252 encoding.
#[inline(always)]
fn encode_table(code: char) -> Option<u8> {
    let byte = match code {
        // Shared with unicode: the byte value is the code point itself.
        '\u{00}'..='\u{7F}' | '\u{A0}'..='\u{FF}' => code as u8,

        // Characters specific to the 0x80-0x9F block.
        '€' => 0x80,
        '‚' => 0x82,
        'ƒ' => 0x83,
        '„' => 0x84,
        '…' => 0x85,
        '†' => 0x86,
        '‡' => 0x87,
        'ˆ' => 0x88,
        '‰' => 0x89,
        'Š' => 0x8A,
        '‹' => 0x8B,
        'Œ' => 0x8C,
        'Ž' => 0x8E,
        '‘' => 0x91,
        '’' => 0x92,
        '“' => 0x93,
        '”' => 0x94,
        '•' => 0x95,
        '–' => 0x96,
        '—' => 0x97,
        '˜' => 0x98,
        '™' => 0x99,
        'š' => 0x9A,
        '›' => 0x9B,
        'œ' => 0x9C,
        'ž' => 0x9E,
        'Ÿ' => 0x9F,

        // Everything else cannot be represented.
        _ => return None,
    };
    Some(byte)
}
|
||||||
|
|
||||||
|
// Maps the range 0x80-0x9F in windows-1252 to unicode. The remaining
// characters in windows-1252 match unicode.
//
// Index into the table with `byte - 0x80`.
//
// The U+FFFD (replacement character) entries stand in for codes not defined
// in windows-1252, and should be treated as an error when encountered. They
// are written as escapes so the placeholder survives encoding-lossy tools.
const DECODE_TABLE: [char; 32] = [
    // 0x80-0x87
    '€', '\u{FFFD}', '‚', 'ƒ', '„', '…', '†', '‡',
    // 0x88-0x8F
    'ˆ', '‰', 'Š', '‹', 'Œ', '\u{FFFD}', 'Ž', '\u{FFFD}',
    // 0x90-0x97
    '\u{FFFD}', '‘', '’', '“', '”', '•', '–', '—',
    // 0x98-0x9F
    '˜', '™', 'š', '›', 'œ', '\u{FFFD}', 'ž', 'Ÿ',
];
|
Loading…
Reference in New Issue
Block a user