Adjusting APIs for text encoding/decoding.

2018-08-19 19:14:13 -07:00 · 2018-08-19 19:14:13 -07:00 · 3ea4e25fbe
commit 3ea4e25fbe
parent 2f5adfad75
5 changed files with 39 additions and 36 deletions
--- a/sub_crates/text_encoding/src/latin1.rs
+++ b/sub_crates/text_encoding/src/latin1.rs
@@ -8,7 +8,7 @@
 use std;
 use {DecodeResult, EncodeError, EncodeResult};

-pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
+pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    // Do the encode.
    let mut input_i = 0;
    let mut output_i = 0;
@@ -19,8 +19,8 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
        if c as u32 > 255 {
            return Err(EncodeError {
                character: c,
-                byte_offset: offset,
-                bytes_written: output_i,
+                error_range: (offset, offset + c.len_utf8()),
+                output_bytes_written: output_i,
            });
        }
        output[output_i] = c as u8;
@@ -38,7 +38,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
        }
    }

-    Ok((input_i, output_i))
+    Ok((input_i, &output[..output_i]))
 }

 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
--- a/sub_crates/text_encoding/src/lib.rs
+++ b/sub_crates/text_encoding/src/lib.rs
@@ -7,7 +7,11 @@ mod utf16_le;
 mod utf8;

 /// Encodes text from utf8 to a destination encoding.
-pub fn encode_from_utf8(output_encoding: Encoding, input: &str, output: &mut [u8]) -> EncodeResult {
+pub fn encode_from_utf8<'a>(
+    output_encoding: Encoding,
+    input: &str,
+    output: &'a mut [u8],
+) -> EncodeResult<'a> {
    match output_encoding {
        Encoding::Utf8 => utf8::encode_from_utf8(input, output),
        Encoding::Utf16BE => utf16_be::encode_from_utf8(input, output),
@@ -48,9 +52,9 @@ pub enum Encoding {

 /// Result type for encoding text from utf8 to a target encoding.
 ///
-/// The Ok() variant provides the number of bytes consumed and the
-/// number of bytes written, in that order.
-pub type EncodeResult = Result<(usize, usize), EncodeError>;
+/// The Ok() variant provides the number of bytes consumed and a reference
+/// to the valid encoded text data.
+pub type EncodeResult<'a> = Result<(usize, &'a [u8]), EncodeError>;

 /// Result type for decoding text from a target encoding to utf8.
 ///
@@ -64,17 +68,17 @@ pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>;
 /// error is encountering a char that is not representable in the target
 /// encoding.
 ///
-/// The problematic character, the byte offset of that character
-/// in the input utf8, and the number of bytes already written to the output
-/// buffer is provided.
+/// The problematic character, the byte index range of that character in the
+/// input utf8, and the number of bytes already written to the output buffer
+/// are provided.
 ///
 /// It is guaranteed that all input leading up to the problem character has
 /// already been encoded and written to the output buffer.
 #[derive(Debug, Copy, Clone)]
 pub struct EncodeError {
    pub character: char,
-    pub byte_offset: usize,
-    pub bytes_written: usize,
+    pub error_range: (usize, usize),
+    pub output_bytes_written: usize,
 }

 /// Represents an error when decoding to utf8 from some other format.
@@ -84,14 +88,13 @@ pub struct EncodeError {
 /// input data that are invalid for the text encoding we're attempting
 /// to decode from.
 ///
-/// The byte offset of the invalid input data and in the number of bytes
-/// already written to the output buffer are.
-/// already been encoded and written to the output buffer.
+/// The byte index range of the invalid input data and the number of bytes
+/// already encoded and written to the output buffer are provided.
 ///
 /// It is guaranteed that all input leading up to the invalid data has
 /// already been encoded and written to the output buffer.
 #[derive(Debug, Copy, Clone)]
 pub struct DecodeError {
-    pub byte_offset: usize,
-    pub bytes_written: usize,
+    pub error_range: (usize, usize),
+    pub output_bytes_written: usize,
 }
--- a/sub_crates/text_encoding/src/utf16_be.rs
+++ b/sub_crates/text_encoding/src/utf16_be.rs
@@ -35,7 +35,7 @@ fn from_big_endian(n: [u8; 2]) -> u16 {
    x
 }

-pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
+pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    // Do the encode.
    let mut input_i = 0;
    let mut output_i = 0;
@@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
        }
    }

-    Ok((input_i, output_i))
+    Ok((input_i, &output[..output_i]))
 }

 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
@@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
            } else if (code_1 & 0xFC00) == 0xDC00 {
                // Error: orphaned second half of a surrogate pair.
                return Err(DecodeError {
-                    byte_offset: input_i,
-                    bytes_written: output_i,
+                    error_range: (input_i, input_i + 2),
+                    output_bytes_written: output_i,
                });
            } else {
                // Two code units.
@@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
                if !(code_2 & 0xFC00) == 0xDC00 {
                    // Error: second half is not valid surrogate.
                    return Err(DecodeError {
-                        byte_offset: input_i,
-                        bytes_written: output_i,
+                        error_range: (input_i, input_i + 2),
+                        output_bytes_written: output_i,
                    });
                }

--- a/sub_crates/text_encoding/src/utf16_le.rs
+++ b/sub_crates/text_encoding/src/utf16_le.rs
@@ -35,7 +35,7 @@ fn from_little_endian(n: [u8; 2]) -> u16 {
    x
 }

-pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
+pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    // Do the encode.
    let mut input_i = 0;
    let mut output_i = 0;
@@ -78,7 +78,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
        }
    }

-    Ok((input_i, output_i))
+    Ok((input_i, &output[..output_i]))
 }

 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
@@ -101,8 +101,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
            } else if (code_1 & 0xFC00) == 0xDC00 {
                // Error: orphaned second half of a surrogate pair.
                return Err(DecodeError {
-                    byte_offset: input_i,
-                    bytes_written: output_i,
+                    error_range: (input_i, input_i + 2),
+                    output_bytes_written: output_i,
                });
            } else {
                // Two code units.
@@ -116,8 +116,8 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
                if !(code_2 & 0xFC00) == 0xDC00 {
                    // Error: second half is not valid surrogate.
                    return Err(DecodeError {
-                        byte_offset: input_i,
-                        bytes_written: output_i,
+                        error_range: (input_i, input_i + 2),
+                        output_bytes_written: output_i,
                    });
                }

--- a/sub_crates/text_encoding/src/utf8.rs
+++ b/sub_crates/text_encoding/src/utf8.rs
@@ -7,7 +7,7 @@ use std;
 use {DecodeError, DecodeResult, EncodeResult};

 // Encode from utf8
-pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {
+pub fn encode_from_utf8<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {
    let copy_len = {
        if output.len() >= input.len() {
            input.len()
@@ -22,7 +22,7 @@ pub fn encode_from_utf8(input: &str, output: &mut [u8]) -> EncodeResult {

    output[..copy_len].copy_from_slice(input[..copy_len].as_bytes());

-    Ok((copy_len, copy_len))
+    Ok((copy_len, &output[..copy_len]))
 }

 pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {
@@ -33,19 +33,19 @@ pub fn decode_to_utf8<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a
                e.valid_up_to()
            } else {
                return Err(DecodeError {
-                    byte_offset: 0,
-                    bytes_written: 0,
+                    error_range: (0, 1), // TODO: search for the next starting byte to get the range.
+                    output_bytes_written: 0,
                });
            }
        }
    };

-    let (in_consumed, out_written) = encode_from_utf8(
+    let (in_consumed, out_slice) = encode_from_utf8(
        unsafe { std::str::from_utf8_unchecked(&input[..valid_up_to]) },
        output,
    ).unwrap();

    Ok((in_consumed, unsafe {
-        std::str::from_utf8_unchecked(&output[..out_written])
+        std::str::from_utf8_unchecked(out_slice)
    }))
 }