From 30a14403995bfb80a144df8670faf4be56bb58c6 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Thu, 23 Aug 2018 13:21:16 -0700 Subject: [PATCH] Tests for single-byte encodings, and related bug fixes. --- sub_crates/text_encoding/build.rs | 2 +- sub_crates/text_encoding/src/lib.rs | 6 +- sub_crates/text_encoding/src/single_byte.rs | 279 +++++++++++- sub_crates/text_encoding/src/windows1252.rs | 410 ------------------ .../text_encoding/tests/property_tests.rs | 20 +- 5 files changed, 280 insertions(+), 437 deletions(-) delete mode 100644 sub_crates/text_encoding/src/windows1252.rs diff --git a/sub_crates/text_encoding/build.rs b/sub_crates/text_encoding/build.rs index 947abbc..fd611d8 100644 --- a/sub_crates/text_encoding/build.rs +++ b/sub_crates/text_encoding/build.rs @@ -151,7 +151,7 @@ fn generate_single_byte_encoding_from_index( let rev_table = { let mut rev_table = vec![]; for (i, c) in table.iter().enumerate() { - rev_table.push((c, i)); + rev_table.push((c, 128 + i)); } rev_table.sort_by_key(|x| x.0); rev_table diff --git a/sub_crates/text_encoding/src/lib.rs b/sub_crates/text_encoding/src/lib.rs index a27df83..9114eb1 100644 --- a/sub_crates/text_encoding/src/lib.rs +++ b/sub_crates/text_encoding/src/lib.rs @@ -11,9 +11,8 @@ mod utf32_be; mod utf32_le; mod utf8; mod utils; -mod windows1252; -use single_byte::{ibm866, iso_8859_2}; +use single_byte::{ibm866, iso_8859_2, iso_8859_7, windows1252}; /// Encodes text from utf8 to a destination encoding. 
pub fn encode_from_str<'a>( @@ -30,6 +29,7 @@ pub fn encode_from_str<'a>( Encoding::IBM866 => ibm866::encode_from_str(input, output), Encoding::Latin1 => latin1::encode_from_str(input, output), Encoding::ISO8859_2 => iso_8859_2::encode_from_str(input, output), + Encoding::ISO8859_7 => iso_8859_7::encode_from_str(input, output), Encoding::Windows1252 => windows1252::encode_from_str(input, output), } } @@ -49,6 +49,7 @@ pub fn decode_to_str<'a>( Encoding::IBM866 => ibm866::decode_to_str(input, output), Encoding::Latin1 => latin1::decode_to_str(input, output), Encoding::ISO8859_2 => iso_8859_2::decode_to_str(input, output), + Encoding::ISO8859_7 => iso_8859_7::decode_to_str(input, output), Encoding::Windows1252 => windows1252::decode_to_str(input, output), } } @@ -67,6 +68,7 @@ pub enum Encoding { IBM866, // IBM 866 Latin1, // ISO/IEC 8859-1 ISO8859_2, // ISO/IEC 8859-2 + ISO8859_7, // ISO/IEC 8859-7 Windows1252, // Windows code page 1252 } diff --git a/sub_crates/text_encoding/src/single_byte.rs b/sub_crates/text_encoding/src/single_byte.rs index 85973ba..0a1ccc7 100644 --- a/sub_crates/text_encoding/src/single_byte.rs +++ b/sub_crates/text_encoding/src/single_byte.rs @@ -34,10 +34,10 @@ pub mod iso_8859_2 { // include!(concat!(env!("OUT_DIR"), "/iso-8859-6.rs")); // } -// pub mod iso_8859_7 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-7.rs")); -// } +pub mod iso_8859_7 { + // Generated by build.rs + include!(concat!(env!("OUT_DIR"), "/iso-8859-7.rs")); +} // pub mod iso_8859_8 { // // Generated by build.rs @@ -99,10 +99,10 @@ pub mod iso_8859_2 { // include!(concat!(env!("OUT_DIR"), "/windows-1251.rs")); // } -// pub mod windows1252 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/windows-1252.rs")); -// } +pub mod windows1252 { + // Generated by build.rs + include!(concat!(env!("OUT_DIR"), "/windows-1252.rs")); +} // pub mod windows1253 { // // Generated by build.rs @@ -154,16 +154,22 @@ fn 
single_byte_encode_from_str<'a>( if output_i >= output.len() { break; } - if let Ok(i) = table.binary_search_by_key(&c, |x| x.0) { - output[output_i] = table[i].1; + if c as u32 <= 127 { + output[output_i] = c as u8; output_i += 1; input_i = offset + 1; } else { - return Err(EncodeError { - character: c, - error_range: (offset, offset + c.len_utf8()), - output_bytes_written: output_i, - }); + if let Ok(i) = table.binary_search_by_key(&c, |x| x.0) { + output[output_i] = table[i].1; + output_i += 1; + input_i = offset + 1; + } else { + return Err(EncodeError { + character: c, + error_range: (offset, offset + c.len_utf8()), + output_bytes_written: output_i, + }); + } } } @@ -224,3 +230,246 @@ fn single_byte_decode_to_str<'a>( core::str::from_utf8_unchecked(&output[..output_i]) })) } + +//=========================================================================== + +// Testing is done with iso-8859-7, since it has a few undefined characters, +// allowing us to test handling of those. +#[cfg(test)] +mod tests { + use super::iso_8859_7::*; + use {DecodeError, EncodeError}; + + #[test] + fn encode_01() { + let text = "Hello world!"; + let mut buf = [0u8; 0]; + let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); + assert_eq!(consumed_count, 0); + assert_eq!(encoded, &[]); + } + + #[test] + fn encode_02() { + let text = "Hello world!"; + let mut buf = [0u8; 1]; + let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); + assert_eq!(consumed_count, 1); + assert_eq!(encoded, "H".as_bytes()); + } + + #[test] + fn encode_03() { + let text = "Hello world!"; + let mut buf = [0u8; 2]; + let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); + assert_eq!(consumed_count, 2); + assert_eq!(encoded, "He".as_bytes()); + } + + #[test] + fn encode_04() { + let text = "Hello world!"; + let mut buf = [0u8; 64]; + let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); + assert_eq!(consumed_count, 12); + 
assert_eq!(encoded, "Hello world!".as_bytes()); + } + + #[test] + fn encode_05() { + let text = "Hello world!こ"; + let mut buf = [0u8; 12]; + let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); + assert_eq!(consumed_count, 12); + assert_eq!(encoded, "Hello world!".as_bytes()); + } + + #[test] + fn decode_01() { + let data = [ + 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, + ]; // "Hello world!" + let mut buf = [0u8; 0]; + let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); + assert_eq!(consumed_count, 0); + assert_eq!(decoded, ""); + } + + #[test] + fn decode_02() { + let data = [ + 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, + ]; // "Hello world!" + let mut buf = [0u8; 1]; + let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); + assert_eq!(consumed_count, 1); + assert_eq!(decoded, "H"); + } + + #[test] + fn decode_03() { + let data = [ + 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, + ]; // "Hello world!" + let mut buf = [0u8; 2]; + let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); + assert_eq!(consumed_count, 2); + assert_eq!(decoded, "He"); + } + + #[test] + fn decode_04() { + let data = [ + 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, + ]; // "Hello world!" 
+ let mut buf = [0u8; 64]; + let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); + assert_eq!(consumed_count, 12); + assert_eq!(decoded, "Hello world!"); + } + + #[test] + fn decode_05() { + let data = [ + 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, + 0xCF, 0xD0, 0xD1, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, + ]; // "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ" + let mut buf = [0u8; 128]; + let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); + assert_eq!(consumed_count, 24); + assert_eq!(decoded, "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"); + } + + #[test] + fn encode_error_01() { + let text = "こello world!"; + let mut buf = [0u8; 64]; + assert_eq!( + encode_from_str(text, &mut buf), + Err(EncodeError { + character: 'こ', + error_range: (0, 3), + output_bytes_written: 0, + }) + ); + } + + #[test] + fn encode_error_02() { + let text = "\u{00C0}ello world!"; + let mut buf = [0u8; 64]; + assert_eq!( + encode_from_str(text, &mut buf), + Err(EncodeError { + character: '\u{00C0}', + error_range: (0, 2), + output_bytes_written: 0, + }) + ); + } + + #[test] + fn encode_error_03() { + let text = "Hこllo world!"; + let mut buf = [0u8; 64]; + assert_eq!( + encode_from_str(text, &mut buf), + Err(EncodeError { + character: 'こ', + error_range: (1, 4), + output_bytes_written: 1, + }) + ); + } + + #[test] + fn encode_error_04() { + let text = "H\u{00C0}llo world!"; + let mut buf = [0u8; 64]; + assert_eq!( + encode_from_str(text, &mut buf), + Err(EncodeError { + character: '\u{00C0}', + error_range: (1, 3), + output_bytes_written: 1, + }) + ); + } + + #[test] + fn encode_error_05() { + let text = "Heこlo world!"; + let mut buf = [0u8; 3]; + assert_eq!( + encode_from_str(text, &mut buf), + Err(EncodeError { + character: 'こ', + error_range: (2, 5), + output_bytes_written: 2, + }) + ); + } + + #[test] + fn encode_error_06() { + let text = "He\u{00C0}lo world!"; + let mut buf = [0u8; 3]; + assert_eq!( + encode_from_str(text, &mut buf), + 
Err(EncodeError { + character: '\u{00C0}', + error_range: (2, 4), + output_bytes_written: 2, + }) + ); + } + + #[test] + fn decode_error_01() { + let data = [ + 0x48, 0xAE, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, + ]; // "Hello world!" with an error on the second byte (undefined byte). + let mut buf = [0u8; 64]; + let error = decode_to_str(&data, &mut buf); + assert_eq!( + error, + Err(DecodeError { + error_range: (1, 2), + output_bytes_written: 1, + }) + ); + } + + #[test] + fn decode_error_02() { + let data = [ + 0x48, 0xD2, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, + ]; // "Hello world!" with an error on the second byte (undefined byte). + let mut buf = [0u8; 64]; + let error = decode_to_str(&data, &mut buf); + assert_eq!( + error, + Err(DecodeError { + error_range: (1, 2), + output_bytes_written: 1, + }) + ); + } + + #[test] + fn decode_error_03() { + let data = [ + 0x48, 0xFF, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, + ]; // "Hello world!" with an error on the second byte (undefined byte). + let mut buf = [0u8; 64]; + let error = decode_to_str(&data, &mut buf); + assert_eq!( + error, + Err(DecodeError { + error_range: (1, 2), + output_bytes_written: 1, + }) + ); + } +} diff --git a/sub_crates/text_encoding/src/windows1252.rs b/sub_crates/text_encoding/src/windows1252.rs deleted file mode 100644 index 9f929c5..0000000 --- a/sub_crates/text_encoding/src/windows1252.rs +++ /dev/null @@ -1,410 +0,0 @@ -//! Encoding/decoding functions for Windows-1252. - -use core; -use {DecodeError, DecodeResult, EncodeError, EncodeResult}; - -pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { - // Do the encode. 
- let mut input_i = 0; - let mut output_i = 0; - for (offset, c) in input.char_indices() { - if output_i >= output.len() { - break; - } - if let Some(byte) = encode_table(c) { - output[output_i] = byte; - output_i += 1; - input_i = offset + 1; - } else { - return Err(EncodeError { - character: c, - error_range: (offset, offset + c.len_utf8()), - output_bytes_written: output_i, - }); - } - } - - // Calculate how much of the input was consumed. - if input_i > input.len() { - input_i = input.len(); - } else { - while !input.is_char_boundary(input_i) { - input_i += 1; - } - } - - Ok((input_i, &output[..output_i])) -} - -pub fn decode_to_str<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { - let mut input_i = 0; - let mut output_i = 0; - for &byte in input.iter() { - if byte < 0x80 { - // 1-byte case - if output_i >= output.len() { - break; - } - output[output_i] = byte; - input_i += 1; - output_i += 1; - } else if byte < 0xA0 { - // Use lookup table. - let code = DECODE_TABLE[byte as usize - 0x80]; - if code == '�' { - // Error: undefined byte. - return Err(DecodeError { - error_range: (input_i, input_i + 1), - output_bytes_written: output_i, - }); - } - // Encode to utf8 - let mut buf = [0u8; 4]; - let s = code.encode_utf8(&mut buf); - if (output_i + s.len()) > output.len() { - break; - } - output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes()); - input_i += 1; - output_i += s.len(); - } else { - // Non-lookup-table 2-byte case - if (output_i + 1) >= output.len() { - break; - } - output[output_i] = 0b11000000 | (byte >> 6); - output[output_i + 1] = 0b10000000 | (byte & 0b00111111); - input_i += 1; - output_i += 2; - } - } - - Ok((input_i, unsafe { - core::str::from_utf8_unchecked(&output[..output_i]) - })) -} - -// Maps unicode to windows-1252. -// -// Returns `None` for characters not in windows-1252. 
-#[inline(always)] -fn encode_table(code: char) -> Option<u8> { - if (code as u32) < 0x80 || ((code as u32) > 0x9F && (code as u32) <= 0xFF) { - return Some(code as u8); - } - match code { - '\u{20AC}' => Some(0x80), - '\u{201A}' => Some(0x82), - '\u{0192}' => Some(0x83), - '\u{201E}' => Some(0x84), - '\u{2026}' => Some(0x85), - '\u{2020}' => Some(0x86), - '\u{2021}' => Some(0x87), - '\u{02C6}' => Some(0x88), - '\u{2030}' => Some(0x89), - '\u{0160}' => Some(0x8A), - '\u{2039}' => Some(0x8B), - '\u{0152}' => Some(0x8C), - '\u{017D}' => Some(0x8E), - '\u{2018}' => Some(0x91), - '\u{2019}' => Some(0x92), - '\u{201C}' => Some(0x93), - '\u{201D}' => Some(0x94), - '\u{2022}' => Some(0x95), - '\u{2013}' => Some(0x96), - '\u{2014}' => Some(0x97), - '\u{02DC}' => Some(0x98), - '\u{2122}' => Some(0x99), - '\u{0161}' => Some(0x9A), - '\u{203A}' => Some(0x9B), - '\u{0153}' => Some(0x9C), - '\u{017E}' => Some(0x9E), - '\u{0178}' => Some(0x9F), - _ => None, - } -} - -// Maps the range 0x80-0x9F in windows-1252 to unicode. The remaining -// characters in windows-1252 match unicode. -// -// The '�'s stand in for codes not defined in windows-1252, and should be -// be treated as an error when encountered. 
-const DECODE_TABLE: [char; 32] = [ - '\u{20AC}', '�', '\u{201A}', '\u{0192}', '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}', - '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', '\u{0152}', '�', '\u{017D}', '�', '�', - '\u{2018}', '\u{2019}', '\u{201C}', '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}', '\u{02DC}', - '\u{2122}', '\u{0161}', '\u{203A}', '\u{0153}', '�', '\u{017E}', '\u{0178}', -]; - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn encode_01() { - let text = "Hello world!"; - let mut buf = [0u8; 0]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_02() { - let text = "Hello world!"; - let mut buf = [0u8; 1]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 1); - assert_eq!(encoded, "H".as_bytes()); - } - - #[test] - fn encode_03() { - let text = "Hello world!"; - let mut buf = [0u8; 2]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 2); - assert_eq!(encoded, "He".as_bytes()); - } - - #[test] - fn encode_04() { - let text = "Hello world!"; - let mut buf = [0u8; 64]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 12); - assert_eq!(encoded, "Hello world!".as_bytes()); - } - - #[test] - fn encode_05() { - let text = "Hello world!こ"; - let mut buf = [0u8; 12]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 12); - assert_eq!(encoded, "Hello world!".as_bytes()); - } - - #[test] - fn decode_01() { - let data = [ - 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" 
- let mut buf = [0u8; 0]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_02() { - let data = [ - 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" - let mut buf = [0u8; 1]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 1); - assert_eq!(decoded, "H"); - } - - #[test] - fn decode_03() { - let data = [ - 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" - let mut buf = [0u8; 2]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 2); - assert_eq!(decoded, "He"); - } - - #[test] - fn decode_04() { - let data = [ - 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 12); - assert_eq!(decoded, "Hello world!"); - } - - #[test] - fn decode_05() { - let data = [ - 0x80, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8E, 0x91, - 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9E, 0x9F, - ]; // "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ", all of the non-latin1 matching characters. 
- let mut buf = [0u8; 128]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 27); - assert_eq!( - decoded, - "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ" - ); - } - - #[test] - fn encode_error_01() { - let text = "こello world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: 'こ', - error_range: (0, 3), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn encode_error_02() { - let text = "\u{0085}ello world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: '\u{0085}', - error_range: (0, 2), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn encode_error_03() { - let text = "Hこllo world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: 'こ', - error_range: (1, 4), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn encode_error_04() { - let text = "H\u{0085}llo world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: '\u{0085}', - error_range: (1, 3), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn encode_error_05() { - let text = "Heこlo world!"; - let mut buf = [0u8; 3]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: 'こ', - error_range: (2, 5), - output_bytes_written: 2, - }) - ); - } - - #[test] - fn encode_error_06() { - let text = "He\u{0085}lo world!"; - let mut buf = [0u8; 3]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: '\u{0085}', - error_range: (2, 4), - output_bytes_written: 2, - }) - ); - } - - #[test] - fn decode_error_01() { - let data = [ - 0x48, 0x81, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" with an error on the second byte (undefined byte). 
- let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (1, 2), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn decode_error_02() { - let data = [ - 0x48, 0x8D, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" with an error on the second byte (undefined byte). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (1, 2), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn decode_error_03() { - let data = [ - 0x48, 0x8F, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" with an error on the second byte (undefined byte). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (1, 2), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn decode_error_04() { - let data = [ - 0x48, 0x90, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" with an error on the second byte (undefined byte). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (1, 2), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn decode_error_05() { - let data = [ - 0x48, 0x9D, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" with an error on the second byte (undefined byte). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (1, 2), - output_bytes_written: 1, - }) - ); - } -} diff --git a/sub_crates/text_encoding/tests/property_tests.rs b/sub_crates/text_encoding/tests/property_tests.rs index 9f76cf0..bc8171c 100644 --- a/sub_crates/text_encoding/tests/property_tests.rs +++ b/sub_crates/text_encoding/tests/property_tests.rs @@ -197,23 +197,25 @@ proptest! 
{ assert_eq!(&data[..], &latin1[..]); } + // The iso-8859-7 tests are representative of all single-byte encodings + // (except latin1) since they're all generated and share their code. #[test] - fn pt_windows1252_roundtrip(mut data in vec(0u8..=255, 0..1000)) { + fn pt_iso_8859_7_roundtrip(mut data in vec(0u8..=255, 0..1000)) { let mut buf = [0u8; 32]; let mut utf8 = String::new(); - let mut w1252: Vec<u8> = Vec::new(); + let mut iso8859_7: Vec<u8> = Vec::new(); // Eliminate undefined bytes in input. for b in data.iter_mut() { - if *b == 0x81 || *b == 0x8D || *b == 0x8F || *b == 0x90 || *b == 0x9D { + if *b == 0xAE || *b == 0xD2 || *b == 0xFF { *b = 0; } } - // Decode from windows-1252 to utf8 + // Decode from iso-8859-7 to utf8 let mut tmp = &data[..]; while !tmp.is_empty() { - if let Ok((n, decoded)) = decode_to_str(Encoding::Windows1252, tmp, &mut buf) { + if let Ok((n, decoded)) = decode_to_str(Encoding::ISO8859_7, tmp, &mut buf) { tmp = &tmp[n..]; utf8.extend(decoded.chars()); } else { @@ -221,17 +223,17 @@ proptest! { } } - // Encode to from utf8 back to w1252 + // Encode from utf8 back to iso-8859-7 let mut tmp = &utf8[..]; while !tmp.is_empty() { - if let Ok((n, encoded)) = encode_from_str(Encoding::Windows1252, tmp, &mut buf) { + if let Ok((n, encoded)) = encode_from_str(Encoding::ISO8859_7, tmp, &mut buf) { tmp = &tmp[n..]; - w1252.extend_from_slice(encoded); + iso8859_7.extend_from_slice(encoded); } else { panic!("Error when encoding."); } } - assert_eq!(&data[..], &w1252[..]); + assert_eq!(&data[..], &iso8859_7[..]); } }