diff --git a/Cargo.lock b/Cargo.lock index 4342648..539a53b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,7 +9,6 @@ dependencies = [ "serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", "smallvec 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "text_encoding 0.1.0", "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -30,37 +29,6 @@ dependencies = [ "unicode-segmentation 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "bit-set" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "bit-vec 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "bit-vec" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "bitflags" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "byteorder" -version = "1.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "cloudabi" -version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "docopt" version = "0.8.3" @@ -73,25 +41,6 @@ dependencies = [ "strsim 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "fnv" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "fuchsia-zircon" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "fuchsia-zircon-sys" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "lazy_static" version = "1.0.2" @@ -110,11 +59,6 @@ dependencies = [ "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "num-traits" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "proc-macro2" version = "0.4.9" @@ -123,28 +67,6 @@ dependencies = [ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "proptest" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "bit-set 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", - "bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)", - "lazy_static 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", - "num-traits 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)", - "quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", - "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", - "rusty-fork 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", - "tempfile 3.0.3 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "quick-error" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "quote" version = "0.6.4" @@ -153,23 +75,6 @@ dependencies = [ "proc-macro2 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "rand" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", - "rand_core 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "rand_core" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "redox_syscall" version = "0.1.40" @@ -203,22 +108,6 @@ dependencies = [ "ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "regex-syntax" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "remove_dir_all" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "ropey" version = "0.8.4" @@ -227,17 +116,6 @@ dependencies = [ "smallvec 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "rusty-fork" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)", - "quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", - "tempfile 3.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "wait-timeout 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "serde" version = "1.0.70" @@ -276,18 +154,6 @@ dependencies = [ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "tempfile" -version = "3.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", - "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", - "redox_syscall 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)", - "remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "termion" version = "1.5.1" @@ -298,13 +164,6 @@ dependencies = [ "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "text_encoding" -version = "0.1.0" -dependencies = [ - "proptest 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)", -] - [[package]] name = "thread_local" version = "0.3.5" @@ -352,68 +211,24 @@ name = "void" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "wait-timeout" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "winapi" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" - [metadata] "checksum aho-corasick 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c1c6d463cbe7ed28720b5b489e7c083eeb8f90d08be2a0d6bb9e1ffea9ce1afa" -"checksum bit-set 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6f1efcc46c18245a69c38fcc5cc650f16d3a59d034f3106e9ed63748f695730a" -"checksum bit-vec 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4440d5cb623bb7390ae27fec0bb6c61111969860f8e3ae198bfa0663645e67cf" -"checksum bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d0c54bb8f454c567f21197eefcdbf5679d0bd99f2ddbe52e84c77061952e6789" -"checksum byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8389c509ec62b9fe8eca58c502a0acaf017737355615243496cde4994f8fa4f9" -"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" "checksum docopt 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d8acd393692c503b168471874953a2531df0e9ab77d0b6bbc582395743300a4a" -"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" -"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" -"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum lazy_static 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "fb497c35d362b6a331cfd94956a07fc2c78a4604cdbee844a81170386b996dd3" "checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1" "checksum memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "796fba70e76612589ed2ce7f45282f5af869e0fdd7cc6199fa1aa1f1d591ba9d" -"checksum num-traits 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "630de1ef5cc79d0cdd78b7e33b81f083cbfe90de0f4b2b2f07f905867c70e9fe" "checksum proc-macro2 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)" = "cccdc7557a98fe98453030f077df7f3a042052fae465bb61d2c2c41435cfd9b6" -"checksum proptest 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "56f423fe98260316065f96eda6fcb2b892d08114a77ad753e4a257c5303ce0fc" -"checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0" "checksum quote 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b71f9f575d55555aa9c06188be9d4e2bfc83ed02537948ac0d520c24d0419f1a" -"checksum rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)" = "e464cd887e869cddcae8792a4ee31d23c7edd516700695608f5b98c67ee0131c" -"checksum rand_core 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "edecf0f94da5551fc9b492093e30b041a891657db7940ee221f9d2f66e82eef2" "checksum redox_syscall 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)" = "c214e91d3ecf43e9a4e41e578973adeb14b474f2bee858742d127af75a0112b1" "checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" "checksum regex 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9329abc99e39129fcceabd24cf5d85b4671ef7c29c50e972bc5afe32438ec384" "checksum regex-syntax 0.5.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7d707a4fa2637f2dca2ef9fd02225ec7661fe01a53623c1e6515b6916511f7a7" -"checksum regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "747ba3b235651f6e2f67dfa8bcdcd073ddb7c243cb21c442fc12395dfcac212d" -"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5" "checksum ropey 0.8.4 (git+https://github.com/cessen/ropey)" = "" -"checksum rusty-fork 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ea98d8d2644fd8b4946a2be90e8c6dc52b652e03079c46e134d9815062b9082d" "checksum serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)" = "0c3adf19c07af6d186d91dae8927b83b0553d07ca56cbf7f2f32560455c91920" "checksum serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)" = "3525a779832b08693031b8ecfb0de81cd71cfd3812088fafe9a7496789572124" "checksum smallvec 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "26df3bb03ca5eac2e64192b723d51f56c1b1e0860e7c766281f4598f181acdc8" "checksum strsim 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b4d15c810519a91cf877e7e36e63fe068815c678181439f2f29e2562147c3694" "checksum syn 0.14.5 (registry+https://github.com/rust-lang/crates.io-index)" = "4bad7abdf6633f07c7046b90484f1d9dc055eca39f8c991177b1046ce61dba9a" -"checksum tempfile 3.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c4b103c6d08d323b92ff42c8ce62abcd83ca8efa7fd5bf7927efefec75f58c76" "checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" "checksum thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279ef31c19ededf577bfd12dfae728040a21f635b06a24cd670ff510edd38963" "checksum ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "fd2be2d6639d0f8fe6cdda291ad456e23629558d466e2789d2c3e9892bda285d" @@ -423,7 +238,3 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" "checksum utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "662fab6525a98beff2921d7f61a39e7d59e0b425ebc7d0d9e66d316e55124122" "checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" -"checksum wait-timeout 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "b9f3bf741a801531993db6478b95682117471f76916f5e690dd8d45395b09349" -"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd" -"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml index dfb8db6..8529bdf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,6 @@ [workspace] members = [ "sub_crates/backend", - "sub_crates/text_encoding", ] [package] @@ -28,6 +27,3 @@ termion = "1.5" # Local crate dependencies [dependencies.backend] path = "sub_crates/backend" - -[dependencies.text_encoding] -path = "sub_crates/text_encoding" \ No newline at end of file diff --git a/sub_crates/text_encoding/Cargo.toml b/sub_crates/text_encoding/Cargo.toml deleted file mode 100644 index 41d8070..0000000 --- a/sub_crates/text_encoding/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "text_encoding" -version = "0.1.0" -authors = ["Nathan Vegdahl "] -license = "MIT" -build = "build.rs" - -[lib] -name = "text_encoding" -path = "src/lib.rs" - -[dev-dependencies] -proptest = "0.8" \ No newline at end of file diff --git a/sub_crates/text_encoding/build.rs b/sub_crates/text_encoding/build.rs deleted file mode 100644 index fd611d8..0000000 --- a/sub_crates/text_encoding/build.rs +++ /dev/null @@ -1,221 +0,0 @@ -use std::env; -use std::fs::File; -use std::io::{BufRead, Read, Write}; -use std::path::Path; - -fn main() { - let out_dir = env::var("OUT_DIR").unwrap(); - - // Generate all of the single byte encoding tables and wrapper code. - { - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-ibm866.txt").unwrap(), - File::create(&Path::new(&out_dir).join("ibm866.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-2.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-2.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-3.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-3.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-4.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-4.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-5.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-5.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-6.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-6.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-7.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-7.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-8.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-8.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-10.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-10.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-13.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-13.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-14.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-14.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-15.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-15.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-iso-8859-16.txt").unwrap(), - File::create(&Path::new(&out_dir).join("iso-8859-16.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-koi8-r.txt").unwrap(), - File::create(&Path::new(&out_dir).join("koi8-r.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-koi8-u.txt").unwrap(), - File::create(&Path::new(&out_dir).join("koi8-u.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-macintosh.txt").unwrap(), - File::create(&Path::new(&out_dir).join("macintosh.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-windows-874.txt").unwrap(), - File::create(&Path::new(&out_dir).join("windows-874.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-windows-1250.txt").unwrap(), - File::create(&Path::new(&out_dir).join("windows-1250.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-windows-1251.txt").unwrap(), - File::create(&Path::new(&out_dir).join("windows-1251.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-windows-1252.txt").unwrap(), - File::create(&Path::new(&out_dir).join("windows-1252.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-windows-1253.txt").unwrap(), - File::create(&Path::new(&out_dir).join("windows-1253.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-windows-1254.txt").unwrap(), - File::create(&Path::new(&out_dir).join("windows-1254.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-windows-1255.txt").unwrap(), - File::create(&Path::new(&out_dir).join("windows-1255.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-windows-1256.txt").unwrap(), - File::create(&Path::new(&out_dir).join("windows-1256.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-windows-1257.txt").unwrap(), - File::create(&Path::new(&out_dir).join("windows-1257.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-windows-1258.txt").unwrap(), - File::create(&Path::new(&out_dir).join("windows-1258.rs")).unwrap(), - ).unwrap(); - generate_single_byte_encoding_from_index( - File::open("encoding_tables/index-x-mac-cyrillic.txt").unwrap(), - File::create(&Path::new(&out_dir).join("x-mac-cyrillic.rs")).unwrap(), - ).unwrap(); - } -} - -fn generate_single_byte_encoding_from_index( - in_file: R, - mut out_file: W, -) -> std::io::Result<()> { - let in_file = std::io::BufReader::new(in_file); - - // Collect the table. - let table = { - let mut table = ['�'; 128]; - for line in in_file.lines() { - let tmp = line.unwrap(); - let line = tmp.trim(); - if line.starts_with("#") || line == "" { - continue; - } - - let elements: Vec<_> = line.split_whitespace().collect(); - if elements.len() >= 2 { - let index = elements[0].parse::().unwrap(); - assert!(index <= 127); - let code = std::char::from_u32(u32::from_str_radix(&elements[1][2..], 16).unwrap()) - .unwrap(); - table[index] = code; - } - } - table - }; - - // Build the reverse table. - let rev_table = { - let mut rev_table = vec![]; - for (i, c) in table.iter().enumerate() { - rev_table.push((c, 128 + i)); - } - rev_table.sort_by_key(|x| x.0); - rev_table - }; - - // Write shared code. - out_file.write_all( - format!( - r#" -use {{DecodeResult, EncodeResult}}; - -pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> {{ - super::single_byte_encode_from_str(&ENCODE_TABLE, input, output) -}} - -pub fn decode_to_str<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> {{ - super::single_byte_decode_to_str(&DECODE_TABLE, input, output) -}} -"# - ).as_bytes(), - )?; - - // Write encode table. - out_file.write_all( - format!( - r#" -const ENCODE_TABLE: [(char, u8); {}] = [ -"#, - rev_table.len() - ).as_bytes(), - )?; - - for (c, i) in rev_table.iter() { - out_file.write_all(format!("('\\u{{{:04X}}}', 0x{:02X}), ", **c as u32, i).as_bytes())?; - } - - out_file.write_all( - format!( - r#" -]; -"# - ).as_bytes(), - )?; - - // Write decode table. - out_file.write_all( - format!( - r#" -const DECODE_TABLE: [char; 128] = [ -"# - ).as_bytes(), - )?; - - for c in table.iter() { - out_file.write_all(format!("'\\u{{{:04X}}}', ", *c as u32).as_bytes())?; - } - - out_file.write_all( - format!( - r#" -]; -"# - ).as_bytes(), - )?; - - Ok(()) -} diff --git a/sub_crates/text_encoding/encoding_tables/index-ibm866.txt b/sub_crates/text_encoding/encoding_tables/index-ibm866.txt deleted file mode 100644 index 6bbd9e3..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-ibm866.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-ibm866.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: db6fe14a559d1601a7667338d83704773d5708dbc641e1ad3c5e21405770f05e -# Date: 2018-01-06 - - 0 0x0410 А (CYRILLIC CAPITAL LETTER A) - 1 0x0411 Б (CYRILLIC CAPITAL LETTER BE) - 2 0x0412 В (CYRILLIC CAPITAL LETTER VE) - 3 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) - 4 0x0414 Д (CYRILLIC CAPITAL LETTER DE) - 5 0x0415 Е (CYRILLIC CAPITAL LETTER IE) - 6 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) - 7 0x0417 З (CYRILLIC CAPITAL LETTER ZE) - 8 0x0418 И (CYRILLIC CAPITAL LETTER I) - 9 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) - 10 0x041A К (CYRILLIC CAPITAL LETTER KA) - 11 0x041B Л (CYRILLIC CAPITAL LETTER EL) - 12 0x041C М (CYRILLIC CAPITAL LETTER EM) - 13 0x041D Н (CYRILLIC CAPITAL LETTER EN) - 14 0x041E О (CYRILLIC CAPITAL LETTER O) - 15 0x041F П (CYRILLIC CAPITAL LETTER PE) - 16 0x0420 Р (CYRILLIC CAPITAL LETTER ER) - 17 0x0421 С (CYRILLIC CAPITAL LETTER ES) - 18 0x0422 Т (CYRILLIC CAPITAL LETTER TE) - 19 0x0423 У (CYRILLIC CAPITAL LETTER U) - 20 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) - 21 0x0425 Х (CYRILLIC CAPITAL LETTER HA) - 22 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) - 23 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) - 24 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) - 25 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) - 26 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) - 27 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) - 28 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) - 29 0x042D Э (CYRILLIC CAPITAL LETTER E) - 30 0x042E Ю (CYRILLIC CAPITAL LETTER YU) - 31 0x042F Я (CYRILLIC CAPITAL LETTER YA) - 32 0x0430 а (CYRILLIC SMALL LETTER A) - 33 0x0431 б (CYRILLIC SMALL LETTER BE) - 34 0x0432 в (CYRILLIC SMALL LETTER VE) - 35 0x0433 г (CYRILLIC SMALL LETTER GHE) - 36 0x0434 д (CYRILLIC SMALL LETTER DE) - 37 0x0435 е (CYRILLIC SMALL LETTER IE) - 38 0x0436 ж (CYRILLIC SMALL LETTER ZHE) - 39 0x0437 з (CYRILLIC SMALL LETTER ZE) - 40 0x0438 и (CYRILLIC SMALL LETTER I) - 41 0x0439 й (CYRILLIC SMALL LETTER SHORT I) - 42 0x043A к (CYRILLIC SMALL LETTER KA) - 43 0x043B л (CYRILLIC SMALL LETTER EL) - 44 0x043C м (CYRILLIC SMALL LETTER EM) - 45 0x043D н (CYRILLIC SMALL LETTER EN) - 46 0x043E о (CYRILLIC SMALL LETTER O) - 47 0x043F п (CYRILLIC SMALL LETTER PE) - 48 0x2591 ░ (LIGHT SHADE) - 49 0x2592 ▒ (MEDIUM SHADE) - 50 0x2593 ▓ (DARK SHADE) - 51 0x2502 │ (BOX DRAWINGS LIGHT VERTICAL) - 52 0x2524 ┤ (BOX DRAWINGS LIGHT VERTICAL AND LEFT) - 53 0x2561 ╡ (BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE) - 54 0x2562 ╢ (BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE) - 55 0x2556 ╖ (BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE) - 56 0x2555 ╕ (BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE) - 57 0x2563 ╣ (BOX DRAWINGS DOUBLE VERTICAL AND LEFT) - 58 0x2551 ║ (BOX DRAWINGS DOUBLE VERTICAL) - 59 0x2557 ╗ (BOX DRAWINGS DOUBLE DOWN AND LEFT) - 60 0x255D ╝ (BOX DRAWINGS DOUBLE UP AND LEFT) - 61 0x255C ╜ (BOX DRAWINGS UP DOUBLE AND LEFT SINGLE) - 62 0x255B ╛ (BOX DRAWINGS UP SINGLE AND LEFT DOUBLE) - 63 0x2510 ┐ (BOX DRAWINGS LIGHT DOWN AND LEFT) - 64 0x2514 └ (BOX DRAWINGS LIGHT UP AND RIGHT) - 65 0x2534 ┴ (BOX DRAWINGS LIGHT UP AND HORIZONTAL) - 66 0x252C ┬ (BOX DRAWINGS LIGHT DOWN AND HORIZONTAL) - 67 0x251C ├ (BOX DRAWINGS LIGHT VERTICAL AND RIGHT) - 68 0x2500 ─ (BOX DRAWINGS LIGHT HORIZONTAL) - 69 0x253C ┼ (BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL) - 70 0x255E ╞ (BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE) - 71 0x255F ╟ (BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE) - 72 0x255A ╚ (BOX DRAWINGS DOUBLE UP AND RIGHT) - 73 0x2554 ╔ (BOX DRAWINGS DOUBLE DOWN AND RIGHT) - 74 0x2569 ╩ (BOX DRAWINGS DOUBLE UP AND HORIZONTAL) - 75 0x2566 ╦ (BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL) - 76 0x2560 ╠ (BOX DRAWINGS DOUBLE VERTICAL AND RIGHT) - 77 0x2550 ═ (BOX DRAWINGS DOUBLE HORIZONTAL) - 78 0x256C ╬ (BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL) - 79 0x2567 ╧ (BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE) - 80 0x2568 ╨ (BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE) - 81 0x2564 ╤ (BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE) - 82 0x2565 ╥ (BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE) - 83 0x2559 ╙ (BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE) - 84 0x2558 ╘ (BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE) - 85 0x2552 ╒ (BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE) - 86 0x2553 ╓ (BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE) - 87 0x256B ╫ (BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE) - 88 0x256A ╪ (BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE) - 89 0x2518 ┘ (BOX DRAWINGS LIGHT UP AND LEFT) - 90 0x250C ┌ (BOX DRAWINGS LIGHT DOWN AND RIGHT) - 91 0x2588 █ (FULL BLOCK) - 92 0x2584 ▄ (LOWER HALF BLOCK) - 93 0x258C ▌ (LEFT HALF BLOCK) - 94 0x2590 ▐ (RIGHT HALF BLOCK) - 95 0x2580 ▀ (UPPER HALF BLOCK) - 96 0x0440 р (CYRILLIC SMALL LETTER ER) - 97 0x0441 с (CYRILLIC SMALL LETTER ES) - 98 0x0442 т (CYRILLIC SMALL LETTER TE) - 99 0x0443 у (CYRILLIC SMALL LETTER U) -100 0x0444 ф (CYRILLIC SMALL LETTER EF) -101 0x0445 х (CYRILLIC SMALL LETTER HA) -102 0x0446 ц (CYRILLIC SMALL LETTER TSE) -103 0x0447 ч (CYRILLIC SMALL LETTER CHE) -104 0x0448 ш (CYRILLIC SMALL LETTER SHA) -105 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) -106 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) -107 0x044B ы (CYRILLIC SMALL LETTER YERU) -108 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) -109 0x044D э (CYRILLIC SMALL LETTER E) -110 0x044E ю (CYRILLIC SMALL LETTER YU) -111 0x044F я (CYRILLIC SMALL LETTER YA) -112 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) -113 0x0451 ё (CYRILLIC SMALL LETTER IO) -114 0x0404 Є (CYRILLIC CAPITAL LETTER UKRAINIAN IE) -115 0x0454 є (CYRILLIC SMALL LETTER UKRAINIAN IE) -116 0x0407 Ї (CYRILLIC CAPITAL LETTER YI) -117 0x0457 ї (CYRILLIC SMALL LETTER YI) -118 0x040E Ў (CYRILLIC CAPITAL LETTER SHORT U) -119 0x045E ў (CYRILLIC SMALL LETTER SHORT U) -120 0x00B0 ° (DEGREE SIGN) -121 0x2219 ∙ (BULLET OPERATOR) -122 0x00B7 · (MIDDLE DOT) -123 0x221A √ (SQUARE ROOT) -124 0x2116 № (NUMERO SIGN) -125 0x00A4 ¤ (CURRENCY SIGN) -126 0x25A0 ■ (BLACK SQUARE) -127 0x00A0   (NO-BREAK SPACE) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-10.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-10.txt deleted file mode 100644 index 8386ba1..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-10.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-iso-8859-10.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 02c2b5590d8ccda9931008c471f6ee2c590b2c8fe5e6ccb3b08638115d778507 -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) - 34 0x0112 Ē (LATIN CAPITAL LETTER E WITH MACRON) - 35 0x0122 Ģ (LATIN CAPITAL LETTER G WITH CEDILLA) - 36 0x012A Ī (LATIN CAPITAL LETTER I WITH MACRON) - 37 0x0128 Ĩ (LATIN CAPITAL LETTER I WITH TILDE) - 38 0x0136 Ķ (LATIN CAPITAL LETTER K WITH CEDILLA) - 39 0x00A7 § (SECTION SIGN) - 40 0x013B Ļ (LATIN CAPITAL LETTER L WITH CEDILLA) - 41 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) - 42 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) - 43 0x0166 Ŧ (LATIN CAPITAL LETTER T WITH STROKE) - 44 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x016A Ū (LATIN CAPITAL LETTER U WITH MACRON) - 47 0x014A Ŋ (LATIN CAPITAL LETTER ENG) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) - 50 0x0113 ē (LATIN SMALL LETTER E WITH MACRON) - 51 0x0123 ģ (LATIN SMALL LETTER G WITH CEDILLA) - 52 0x012B ī (LATIN SMALL LETTER I WITH MACRON) - 53 0x0129 ĩ (LATIN SMALL LETTER I WITH TILDE) - 54 0x0137 ķ (LATIN SMALL LETTER K WITH CEDILLA) - 55 0x00B7 · (MIDDLE DOT) - 56 0x013C ļ (LATIN SMALL LETTER L WITH CEDILLA) - 57 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) - 58 0x0161 š (LATIN SMALL LETTER S WITH CARON) - 59 0x0167 ŧ (LATIN SMALL LETTER T WITH STROKE) - 60 0x017E ž (LATIN SMALL LETTER Z WITH CARON) - 61 0x2015 ― (HORIZONTAL BAR) - 62 0x016B ū (LATIN SMALL LETTER U WITH MACRON) - 63 0x014B ŋ (LATIN SMALL LETTER ENG) - 64 0x0100 Ā (LATIN CAPITAL LETTER A WITH MACRON) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) - 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 71 0x012E Į (LATIN CAPITAL LETTER I WITH OGONEK) - 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x0116 Ė (LATIN CAPITAL LETTER E WITH DOT ABOVE) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) - 80 0x00D0 Ð (LATIN CAPITAL LETTER ETH) - 81 0x0145 Ņ (LATIN CAPITAL LETTER N WITH CEDILLA) - 82 0x014C Ō (LATIN CAPITAL LETTER O WITH MACRON) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x0168 Ũ (LATIN CAPITAL LETTER U WITH TILDE) - 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) - 89 0x0172 Ų (LATIN CAPITAL LETTER U WITH OGONEK) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) - 94 0x00DE Þ (LATIN CAPITAL LETTER THORN) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x0101 ā (LATIN SMALL LETTER A WITH MACRON) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) -102 0x00E6 æ (LATIN SMALL LETTER AE) -103 0x012F į (LATIN SMALL LETTER I WITH OGONEK) -104 0x010D č (LATIN SMALL LETTER C WITH CARON) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x0117 ė (LATIN SMALL LETTER E WITH DOT ABOVE) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) -112 0x00F0 ð (LATIN SMALL LETTER ETH) -113 0x0146 ņ (LATIN SMALL LETTER N WITH CEDILLA) -114 0x014D ō (LATIN SMALL LETTER O WITH MACRON) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x0169 ũ (LATIN SMALL LETTER U WITH TILDE) -120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) -121 0x0173 ų (LATIN SMALL LETTER U WITH OGONEK) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) -126 0x00FE þ (LATIN SMALL LETTER THORN) -127 0x0138 ĸ (LATIN SMALL LETTER KRA) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-13.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-13.txt deleted file mode 100644 index 031bb90..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-13.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-iso-8859-13.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 40736338e964ab520407cebcb01329f8d450abf6ce12bf88b74b655b60e43300 -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 34 0x00A2 ¢ (CENT SIGN) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x0156 Ŗ (LATIN CAPITAL LETTER R WITH CEDILLA) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) - 57 0x00B9 ¹ (SUPERSCRIPT ONE) - 58 0x0157 ŗ (LATIN SMALL LETTER R WITH CEDILLA) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) - 63 0x00E6 æ (LATIN SMALL LETTER AE) - 64 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) - 65 0x012E Į (LATIN CAPITAL LETTER I WITH OGONEK) - 66 0x0100 Ā (LATIN CAPITAL LETTER A WITH MACRON) - 67 0x0106 Ć (LATIN CAPITAL LETTER C WITH ACUTE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) - 70 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) - 71 0x0112 Ē (LATIN CAPITAL LETTER E WITH MACRON) - 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x0179 Ź (LATIN CAPITAL LETTER Z WITH ACUTE) - 75 0x0116 Ė (LATIN CAPITAL LETTER E WITH DOT ABOVE) - 76 0x0122 Ģ (LATIN CAPITAL LETTER G WITH CEDILLA) - 77 0x0136 Ķ (LATIN CAPITAL LETTER K WITH CEDILLA) - 78 0x012A Ī (LATIN CAPITAL LETTER I WITH MACRON) - 79 0x013B Ļ (LATIN CAPITAL LETTER L WITH CEDILLA) - 80 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) - 81 0x0143 Ń (LATIN CAPITAL LETTER N WITH ACUTE) - 82 0x0145 Ņ (LATIN CAPITAL LETTER N WITH CEDILLA) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x014C Ō (LATIN CAPITAL LETTER O WITH MACRON) - 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x0172 Ų (LATIN CAPITAL LETTER U WITH OGONEK) - 89 0x0141 Ł (LATIN CAPITAL LETTER L WITH STROKE) - 90 0x015A Ś (LATIN CAPITAL LETTER S WITH ACUTE) - 91 0x016A Ū (LATIN CAPITAL LETTER U WITH MACRON) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) - 94 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) - 97 0x012F į (LATIN SMALL LETTER I WITH OGONEK) - 98 0x0101 ā (LATIN SMALL LETTER A WITH MACRON) - 99 0x0107 ć (LATIN SMALL LETTER C WITH ACUTE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) -102 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) -103 0x0113 ē (LATIN SMALL LETTER E WITH MACRON) -104 0x010D č (LATIN SMALL LETTER C WITH CARON) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x017A ź (LATIN SMALL LETTER Z WITH ACUTE) -107 0x0117 ė (LATIN SMALL LETTER E WITH DOT ABOVE) -108 0x0123 ģ (LATIN SMALL LETTER G WITH CEDILLA) -109 0x0137 ķ (LATIN SMALL LETTER K WITH CEDILLA) -110 0x012B ī (LATIN SMALL LETTER I WITH MACRON) -111 0x013C ļ (LATIN SMALL LETTER L WITH CEDILLA) -112 0x0161 š (LATIN SMALL LETTER S WITH CARON) -113 0x0144 ń (LATIN SMALL LETTER N WITH ACUTE) -114 0x0146 ņ (LATIN SMALL LETTER N WITH CEDILLA) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x014D ō (LATIN SMALL LETTER O WITH MACRON) -117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x0173 ų (LATIN SMALL LETTER U WITH OGONEK) -121 0x0142 ł (LATIN SMALL LETTER L WITH STROKE) -122 0x015B ś (LATIN SMALL LETTER S WITH ACUTE) -123 0x016B ū (LATIN SMALL LETTER U WITH MACRON) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) -126 0x017E ž (LATIN SMALL LETTER Z WITH CARON) -127 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-14.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-14.txt deleted file mode 100644 index 932fa55..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-14.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-iso-8859-14.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 2c8651cfc08b1f35b17919ee5379f2fa006af3ec809f11b3b7f470785580542b -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x1E02 Ḃ (LATIN CAPITAL LETTER B WITH DOT ABOVE) - 34 0x1E03 ḃ (LATIN SMALL LETTER B WITH DOT ABOVE) - 35 0x00A3 £ (POUND SIGN) - 36 0x010A Ċ (LATIN CAPITAL LETTER C WITH DOT ABOVE) - 37 0x010B ċ (LATIN SMALL LETTER C WITH DOT ABOVE) - 38 0x1E0A Ḋ (LATIN CAPITAL LETTER D WITH DOT ABOVE) - 39 0x00A7 § (SECTION SIGN) - 40 0x1E80 Ẁ (LATIN CAPITAL LETTER W WITH GRAVE) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x1E82 Ẃ (LATIN CAPITAL LETTER W WITH ACUTE) - 43 0x1E0B ḋ (LATIN SMALL LETTER D WITH DOT ABOVE) - 44 0x1EF2 Ỳ (LATIN CAPITAL LETTER Y WITH GRAVE) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) - 48 0x1E1E Ḟ (LATIN CAPITAL LETTER F WITH DOT ABOVE) - 49 0x1E1F ḟ (LATIN SMALL LETTER F WITH DOT ABOVE) - 50 0x0120 Ġ (LATIN CAPITAL LETTER G WITH DOT ABOVE) - 51 0x0121 ġ (LATIN SMALL LETTER G WITH DOT ABOVE) - 52 0x1E40 Ṁ (LATIN CAPITAL LETTER M WITH DOT ABOVE) - 53 0x1E41 ṁ (LATIN SMALL LETTER M WITH DOT ABOVE) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x1E56 Ṗ (LATIN CAPITAL LETTER P WITH DOT ABOVE) - 56 0x1E81 ẁ (LATIN SMALL LETTER W WITH GRAVE) - 57 0x1E57 ṗ (LATIN SMALL LETTER P WITH DOT ABOVE) - 58 0x1E83 ẃ (LATIN SMALL LETTER W WITH ACUTE) - 59 0x1E60 Ṡ (LATIN CAPITAL LETTER S WITH DOT ABOVE) - 60 0x1EF3 ỳ (LATIN SMALL LETTER Y WITH GRAVE) - 61 0x1E84 Ẅ (LATIN CAPITAL LETTER W WITH DIAERESIS) - 62 0x1E85 ẅ (LATIN SMALL LETTER W WITH DIAERESIS) - 63 0x1E61 ṡ (LATIN SMALL LETTER S WITH DOT ABOVE) - 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) - 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) - 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) - 80 0x0174 Ŵ (LATIN CAPITAL LETTER W WITH CIRCUMFLEX) - 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) - 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x1E6A Ṫ (LATIN CAPITAL LETTER T WITH DOT ABOVE) - 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) - 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) - 94 0x0176 Ŷ (LATIN CAPITAL LETTER Y WITH CIRCUMFLEX) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) -102 0x00E6 æ (LATIN SMALL LETTER AE) -103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) -104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) -112 0x0175 ŵ (LATIN SMALL LETTER W WITH CIRCUMFLEX) -113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) -114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x1E6B ṫ (LATIN SMALL LETTER T WITH DOT ABOVE) -120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) -121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) -126 0x0177 ŷ (LATIN SMALL LETTER Y WITH CIRCUMFLEX) -127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-15.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-15.txt deleted file mode 100644 index 65961d9..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-15.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-iso-8859-15.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: a560aba47bccd7510a6ac77f671fe75dca3800f05cf6d676910c311a8f8ff079 -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x00A1 ¡ (INVERTED EXCLAMATION MARK) - 34 0x00A2 ¢ (CENT SIGN) - 35 0x00A3 £ (POUND SIGN) - 36 0x20AC € (EURO SIGN) - 37 0x00A5 ¥ (YEN SIGN) - 38 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) - 39 0x00A7 § (SECTION SIGN) - 40 0x0161 š (LATIN SMALL LETTER S WITH CARON) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x00AA ª (FEMININE ORDINAL INDICATOR) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x00AF ¯ (MACRON) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x017E ž (LATIN SMALL LETTER Z WITH CARON) - 57 0x00B9 ¹ (SUPERSCRIPT ONE) - 58 0x00BA º (MASCULINE ORDINAL INDICATOR) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x0152 Œ (LATIN CAPITAL LIGATURE OE) - 61 0x0153 œ (LATIN SMALL LIGATURE OE) - 62 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) - 63 0x00BF ¿ (INVERTED QUESTION MARK) - 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) - 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) - 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) - 80 0x00D0 Ð (LATIN CAPITAL LETTER ETH) - 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) - 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) - 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) - 94 0x00DE Þ (LATIN CAPITAL LETTER THORN) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) -102 0x00E6 æ (LATIN SMALL LETTER AE) -103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) -104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) -112 0x00F0 ð (LATIN SMALL LETTER ETH) -113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) -114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) -121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) -126 0x00FE þ (LATIN SMALL LETTER THORN) -127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-16.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-16.txt deleted file mode 100644 index 16e416f..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-16.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-iso-8859-16.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 55676320d2d1b6e6909f5b3d741a7cf0cefc84e920aa4474afc091459111c2e3 -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) - 34 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) - 35 0x0141 Ł (LATIN CAPITAL LETTER L WITH STROKE) - 36 0x20AC € (EURO SIGN) - 37 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 38 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) - 39 0x00A7 § (SECTION SIGN) - 40 0x0161 š (LATIN SMALL LETTER S WITH CARON) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x0218 Ș (LATIN CAPITAL LETTER S WITH COMMA BELOW) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x0179 Ź (LATIN CAPITAL LETTER Z WITH ACUTE) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x017A ź (LATIN SMALL LETTER Z WITH ACUTE) - 47 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) - 51 0x0142 ł (LATIN SMALL LETTER L WITH STROKE) - 52 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) - 53 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x017E ž (LATIN SMALL LETTER Z WITH CARON) - 57 0x010D č (LATIN SMALL LETTER C WITH CARON) - 58 0x0219 ș (LATIN SMALL LETTER S WITH COMMA BELOW) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x0152 Œ (LATIN CAPITAL LIGATURE OE) - 61 0x0153 œ (LATIN SMALL LIGATURE OE) - 62 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) - 63 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) - 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 67 0x0102 Ă (LATIN CAPITAL LETTER A WITH BREVE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x0106 Ć (LATIN CAPITAL LETTER C WITH ACUTE) - 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) - 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) - 80 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) - 81 0x0143 Ń (LATIN CAPITAL LETTER N WITH ACUTE) - 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x0150 Ő (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x015A Ś (LATIN CAPITAL LETTER S WITH ACUTE) - 88 0x0170 Ű (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE) - 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) - 94 0x021A Ț (LATIN CAPITAL LETTER T WITH COMMA BELOW) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x0103 ă (LATIN SMALL LETTER A WITH BREVE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x0107 ć (LATIN SMALL LETTER C WITH ACUTE) -102 0x00E6 æ (LATIN SMALL LETTER AE) -103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) -104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) -112 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) -113 0x0144 ń (LATIN SMALL LETTER N WITH ACUTE) -114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x0151 ő (LATIN SMALL LETTER O WITH DOUBLE ACUTE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x015B ś (LATIN SMALL LETTER S WITH ACUTE) -120 0x0171 ű (LATIN SMALL LETTER U WITH DOUBLE ACUTE) -121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) -126 0x021B ț (LATIN SMALL LETTER T WITH COMMA BELOW) -127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-2.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-2.txt deleted file mode 100644 index b0e14de..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-2.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-iso-8859-2.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 9569c67f22d0b57790e1c407c6eecf227e4562322dc296de43cdab7a0152ec73 -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) - 34 0x02D8 ˘ (BREVE) - 35 0x0141 Ł (LATIN CAPITAL LETTER L WITH STROKE) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x013D Ľ (LATIN CAPITAL LETTER L WITH CARON) - 38 0x015A Ś (LATIN CAPITAL LETTER S WITH ACUTE) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) - 42 0x015E Ş (LATIN CAPITAL LETTER S WITH CEDILLA) - 43 0x0164 Ť (LATIN CAPITAL LETTER T WITH CARON) - 44 0x0179 Ź (LATIN CAPITAL LETTER Z WITH ACUTE) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) - 47 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) - 50 0x02DB ˛ (OGONEK) - 51 0x0142 ł (LATIN SMALL LETTER L WITH STROKE) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x013E ľ (LATIN SMALL LETTER L WITH CARON) - 54 0x015B ś (LATIN SMALL LETTER S WITH ACUTE) - 55 0x02C7 ˇ (CARON) - 56 0x00B8 ¸ (CEDILLA) - 57 0x0161 š (LATIN SMALL LETTER S WITH CARON) - 58 0x015F ş (LATIN SMALL LETTER S WITH CEDILLA) - 59 0x0165 ť (LATIN SMALL LETTER T WITH CARON) - 60 0x017A ź (LATIN SMALL LETTER Z WITH ACUTE) - 61 0x02DD ˝ (DOUBLE ACUTE ACCENT) - 62 0x017E ž (LATIN SMALL LETTER Z WITH CARON) - 63 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) - 64 0x0154 Ŕ (LATIN CAPITAL LETTER R WITH ACUTE) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 67 0x0102 Ă (LATIN CAPITAL LETTER A WITH BREVE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x0139 Ĺ (LATIN CAPITAL LETTER L WITH ACUTE) - 70 0x0106 Ć (LATIN CAPITAL LETTER C WITH ACUTE) - 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) - 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x011A Ě (LATIN CAPITAL LETTER E WITH CARON) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x010E Ď (LATIN CAPITAL LETTER D WITH CARON) - 80 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) - 81 0x0143 Ń (LATIN CAPITAL LETTER N WITH ACUTE) - 82 0x0147 Ň (LATIN CAPITAL LETTER N WITH CARON) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x0150 Ő (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x0158 Ř (LATIN CAPITAL LETTER R WITH CARON) - 89 0x016E Ů (LATIN CAPITAL LETTER U WITH RING ABOVE) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x0170 Ű (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) - 94 0x0162 Ţ (LATIN CAPITAL LETTER T WITH CEDILLA) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x0155 ŕ (LATIN SMALL LETTER R WITH ACUTE) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x0103 ă (LATIN SMALL LETTER A WITH BREVE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x013A ĺ (LATIN SMALL LETTER L WITH ACUTE) -102 0x0107 ć (LATIN SMALL LETTER C WITH ACUTE) -103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) -104 0x010D č (LATIN SMALL LETTER C WITH CARON) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x011B ě (LATIN SMALL LETTER E WITH CARON) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x010F ď (LATIN SMALL LETTER D WITH CARON) -112 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) -113 0x0144 ń (LATIN SMALL LETTER N WITH ACUTE) -114 0x0148 ň (LATIN SMALL LETTER N WITH CARON) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x0151 ő (LATIN SMALL LETTER O WITH DOUBLE ACUTE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x0159 ř (LATIN SMALL LETTER R WITH CARON) -121 0x016F ů (LATIN SMALL LETTER U WITH RING ABOVE) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x0171 ű (LATIN SMALL LETTER U WITH DOUBLE ACUTE) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) -126 0x0163 ţ (LATIN SMALL LETTER T WITH CEDILLA) -127 0x02D9 ˙ (DOT ABOVE) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-3.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-3.txt deleted file mode 100644 index 018861a..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-3.txt +++ /dev/null @@ -1,127 +0,0 @@ -# For details on index index-iso-8859-3.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: af8f1e12df79b768322b5e83613698cdc619438270a2fc359554331c805054a3 -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x0126 Ħ (LATIN CAPITAL LETTER H WITH STROKE) - 34 0x02D8 ˘ (BREVE) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A4 ¤ (CURRENCY SIGN) - 38 0x0124 Ĥ (LATIN CAPITAL LETTER H WITH CIRCUMFLEX) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x0130 İ (LATIN CAPITAL LETTER I WITH DOT ABOVE) - 42 0x015E Ş (LATIN CAPITAL LETTER S WITH CEDILLA) - 43 0x011E Ğ (LATIN CAPITAL LETTER G WITH BREVE) - 44 0x0134 Ĵ (LATIN CAPITAL LETTER J WITH CIRCUMFLEX) - 45 0x00AD ­ (SOFT HYPHEN) - 47 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x0127 ħ (LATIN SMALL LETTER H WITH STROKE) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x00B5 µ (MICRO SIGN) - 54 0x0125 ĥ (LATIN SMALL LETTER H WITH CIRCUMFLEX) - 55 0x00B7 · (MIDDLE DOT) - 56 0x00B8 ¸ (CEDILLA) - 57 0x0131 ı (LATIN SMALL LETTER DOTLESS I) - 58 0x015F ş (LATIN SMALL LETTER S WITH CEDILLA) - 59 0x011F ğ (LATIN SMALL LETTER G WITH BREVE) - 60 0x0135 ĵ (LATIN SMALL LETTER J WITH CIRCUMFLEX) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 63 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) - 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x010A Ċ (LATIN CAPITAL LETTER C WITH DOT ABOVE) - 70 0x0108 Ĉ (LATIN CAPITAL LETTER C WITH CIRCUMFLEX) - 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) - 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) - 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) - 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x0120 Ġ (LATIN CAPITAL LETTER G WITH DOT ABOVE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x011C Ĝ (LATIN CAPITAL LETTER G WITH CIRCUMFLEX) - 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x016C Ŭ (LATIN CAPITAL LETTER U WITH BREVE) - 94 0x015C Ŝ (LATIN CAPITAL LETTER S WITH CIRCUMFLEX) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x010B ċ (LATIN SMALL LETTER C WITH DOT ABOVE) -102 0x0109 ĉ (LATIN SMALL LETTER C WITH CIRCUMFLEX) -103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) -104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) -113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) -114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x0121 ġ (LATIN SMALL LETTER G WITH DOT ABOVE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x011D ĝ (LATIN SMALL LETTER G WITH CIRCUMFLEX) -121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x016D ŭ (LATIN SMALL LETTER U WITH BREVE) -126 0x015D ŝ (LATIN SMALL LETTER S WITH CIRCUMFLEX) -127 0x02D9 ˙ (DOT ABOVE) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-4.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-4.txt deleted file mode 100644 index a268878..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-4.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-iso-8859-4.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 72f29c92344d351fe9e74a946e7e0468d76d542c6894ff82982cb652ebe0feb7 -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) - 34 0x0138 ĸ (LATIN SMALL LETTER KRA) - 35 0x0156 Ŗ (LATIN CAPITAL LETTER R WITH CEDILLA) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x0128 Ĩ (LATIN CAPITAL LETTER I WITH TILDE) - 38 0x013B Ļ (LATIN CAPITAL LETTER L WITH CEDILLA) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) - 42 0x0112 Ē (LATIN CAPITAL LETTER E WITH MACRON) - 43 0x0122 Ģ (LATIN CAPITAL LETTER G WITH CEDILLA) - 44 0x0166 Ŧ (LATIN CAPITAL LETTER T WITH STROKE) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) - 47 0x00AF ¯ (MACRON) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) - 50 0x02DB ˛ (OGONEK) - 51 0x0157 ŗ (LATIN SMALL LETTER R WITH CEDILLA) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x0129 ĩ (LATIN SMALL LETTER I WITH TILDE) - 54 0x013C ļ (LATIN SMALL LETTER L WITH CEDILLA) - 55 0x02C7 ˇ (CARON) - 56 0x00B8 ¸ (CEDILLA) - 57 0x0161 š (LATIN SMALL LETTER S WITH CARON) - 58 0x0113 ē (LATIN SMALL LETTER E WITH MACRON) - 59 0x0123 ģ (LATIN SMALL LETTER G WITH CEDILLA) - 60 0x0167 ŧ (LATIN SMALL LETTER T WITH STROKE) - 61 0x014A Ŋ (LATIN CAPITAL LETTER ENG) - 62 0x017E ž (LATIN SMALL LETTER Z WITH CARON) - 63 0x014B ŋ (LATIN SMALL LETTER ENG) - 64 0x0100 Ā (LATIN CAPITAL LETTER A WITH MACRON) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) - 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 71 0x012E Į (LATIN CAPITAL LETTER I WITH OGONEK) - 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x0116 Ė (LATIN CAPITAL LETTER E WITH DOT ABOVE) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x012A Ī (LATIN CAPITAL LETTER I WITH MACRON) - 80 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) - 81 0x0145 Ņ (LATIN CAPITAL LETTER N WITH CEDILLA) - 82 0x014C Ō (LATIN CAPITAL LETTER O WITH MACRON) - 83 0x0136 Ķ (LATIN CAPITAL LETTER K WITH CEDILLA) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) - 89 0x0172 Ų (LATIN CAPITAL LETTER U WITH OGONEK) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x0168 Ũ (LATIN CAPITAL LETTER U WITH TILDE) - 94 0x016A Ū (LATIN CAPITAL LETTER U WITH MACRON) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x0101 ā (LATIN SMALL LETTER A WITH MACRON) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) -102 0x00E6 æ (LATIN SMALL LETTER AE) -103 0x012F į (LATIN SMALL LETTER I WITH OGONEK) -104 0x010D č (LATIN SMALL LETTER C WITH CARON) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x0117 ė (LATIN SMALL LETTER E WITH DOT ABOVE) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x012B ī (LATIN SMALL LETTER I WITH MACRON) -112 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) -113 0x0146 ņ (LATIN SMALL LETTER N WITH CEDILLA) -114 0x014D ō (LATIN SMALL LETTER O WITH MACRON) -115 0x0137 ķ (LATIN SMALL LETTER K WITH CEDILLA) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) -121 0x0173 ų (LATIN SMALL LETTER U WITH OGONEK) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x0169 ũ (LATIN SMALL LETTER U WITH TILDE) -126 0x016B ū (LATIN SMALL LETTER U WITH MACRON) -127 0x02D9 ˙ (DOT ABOVE) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-5.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-5.txt deleted file mode 100644 index f5e2962..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-5.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-iso-8859-5.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: fa9b1f3f5242df43e2e7bca80e9b6997c67944f20a4af91ee06bacc4e132d9c9 -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) - 34 0x0402 Ђ (CYRILLIC CAPITAL LETTER DJE) - 35 0x0403 Ѓ (CYRILLIC CAPITAL LETTER GJE) - 36 0x0404 Є (CYRILLIC CAPITAL LETTER UKRAINIAN IE) - 37 0x0405 Ѕ (CYRILLIC CAPITAL LETTER DZE) - 38 0x0406 І (CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I) - 39 0x0407 Ї (CYRILLIC CAPITAL LETTER YI) - 40 0x0408 Ј (CYRILLIC CAPITAL LETTER JE) - 41 0x0409 Љ (CYRILLIC CAPITAL LETTER LJE) - 42 0x040A Њ (CYRILLIC CAPITAL LETTER NJE) - 43 0x040B Ћ (CYRILLIC CAPITAL LETTER TSHE) - 44 0x040C Ќ (CYRILLIC CAPITAL LETTER KJE) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x040E Ў (CYRILLIC CAPITAL LETTER SHORT U) - 47 0x040F Џ (CYRILLIC CAPITAL LETTER DZHE) - 48 0x0410 А (CYRILLIC CAPITAL LETTER A) - 49 0x0411 Б (CYRILLIC CAPITAL LETTER BE) - 50 0x0412 В (CYRILLIC CAPITAL LETTER VE) - 51 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) - 52 0x0414 Д (CYRILLIC CAPITAL LETTER DE) - 53 0x0415 Е (CYRILLIC CAPITAL LETTER IE) - 54 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) - 55 0x0417 З (CYRILLIC CAPITAL LETTER ZE) - 56 0x0418 И (CYRILLIC CAPITAL LETTER I) - 57 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) - 58 0x041A К (CYRILLIC CAPITAL LETTER KA) - 59 0x041B Л (CYRILLIC CAPITAL LETTER EL) - 60 0x041C М (CYRILLIC CAPITAL LETTER EM) - 61 0x041D Н (CYRILLIC CAPITAL LETTER EN) - 62 0x041E О (CYRILLIC CAPITAL LETTER O) - 63 0x041F П (CYRILLIC CAPITAL LETTER PE) - 64 0x0420 Р (CYRILLIC CAPITAL LETTER ER) - 65 0x0421 С (CYRILLIC CAPITAL LETTER ES) - 66 0x0422 Т (CYRILLIC CAPITAL LETTER TE) - 67 0x0423 У (CYRILLIC CAPITAL LETTER U) - 68 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) - 69 0x0425 Х (CYRILLIC CAPITAL LETTER HA) - 70 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) - 71 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) - 72 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) - 73 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) - 74 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) - 75 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) - 76 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) - 77 0x042D Э (CYRILLIC CAPITAL LETTER E) - 78 0x042E Ю (CYRILLIC CAPITAL LETTER YU) - 79 0x042F Я (CYRILLIC CAPITAL LETTER YA) - 80 0x0430 а (CYRILLIC SMALL LETTER A) - 81 0x0431 б (CYRILLIC SMALL LETTER BE) - 82 0x0432 в (CYRILLIC SMALL LETTER VE) - 83 0x0433 г (CYRILLIC SMALL LETTER GHE) - 84 0x0434 д (CYRILLIC SMALL LETTER DE) - 85 0x0435 е (CYRILLIC SMALL LETTER IE) - 86 0x0436 ж (CYRILLIC SMALL LETTER ZHE) - 87 0x0437 з (CYRILLIC SMALL LETTER ZE) - 88 0x0438 и (CYRILLIC SMALL LETTER I) - 89 0x0439 й (CYRILLIC SMALL LETTER SHORT I) - 90 0x043A к (CYRILLIC SMALL LETTER KA) - 91 0x043B л (CYRILLIC SMALL LETTER EL) - 92 0x043C м (CYRILLIC SMALL LETTER EM) - 93 0x043D н (CYRILLIC SMALL LETTER EN) - 94 0x043E о (CYRILLIC SMALL LETTER O) - 95 0x043F п (CYRILLIC SMALL LETTER PE) - 96 0x0440 р (CYRILLIC SMALL LETTER ER) - 97 0x0441 с (CYRILLIC SMALL LETTER ES) - 98 0x0442 т (CYRILLIC SMALL LETTER TE) - 99 0x0443 у (CYRILLIC SMALL LETTER U) -100 0x0444 ф (CYRILLIC SMALL LETTER EF) -101 0x0445 х (CYRILLIC SMALL LETTER HA) -102 0x0446 ц (CYRILLIC SMALL LETTER TSE) -103 0x0447 ч (CYRILLIC SMALL LETTER CHE) -104 0x0448 ш (CYRILLIC SMALL LETTER SHA) -105 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) -106 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) -107 0x044B ы (CYRILLIC SMALL LETTER YERU) -108 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) -109 0x044D э (CYRILLIC SMALL LETTER E) -110 0x044E ю (CYRILLIC SMALL LETTER YU) -111 0x044F я (CYRILLIC SMALL LETTER YA) -112 0x2116 № (NUMERO SIGN) -113 0x0451 ё (CYRILLIC SMALL LETTER IO) -114 0x0452 ђ (CYRILLIC SMALL LETTER DJE) -115 0x0453 ѓ (CYRILLIC SMALL LETTER GJE) -116 0x0454 є (CYRILLIC SMALL LETTER UKRAINIAN IE) -117 0x0455 ѕ (CYRILLIC SMALL LETTER DZE) -118 0x0456 і (CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I) -119 0x0457 ї (CYRILLIC SMALL LETTER YI) -120 0x0458 ј (CYRILLIC SMALL LETTER JE) -121 0x0459 љ (CYRILLIC SMALL LETTER LJE) -122 0x045A њ (CYRILLIC SMALL LETTER NJE) -123 0x045B ћ (CYRILLIC SMALL LETTER TSHE) -124 0x045C ќ (CYRILLIC SMALL LETTER KJE) -125 0x00A7 § (SECTION SIGN) -126 0x045E ў (CYRILLIC SMALL LETTER SHORT U) -127 0x045F џ (CYRILLIC SMALL LETTER DZHE) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-6.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-6.txt deleted file mode 100644 index a0691ff..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-6.txt +++ /dev/null @@ -1,89 +0,0 @@ -# For details on index index-iso-8859-6.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 85bb7b5c2dc75975afebe5743935ba4ed5a09c1e9e34e9bfb2ff80293f5d8bbc -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 36 0x00A4 ¤ (CURRENCY SIGN) - 44 0x060C ، (ARABIC COMMA) - 45 0x00AD ­ (SOFT HYPHEN) - 59 0x061B ؛ (ARABIC SEMICOLON) - 63 0x061F ؟ (ARABIC QUESTION MARK) - 65 0x0621 ء (ARABIC LETTER HAMZA) - 66 0x0622 آ (ARABIC LETTER ALEF WITH MADDA ABOVE) - 67 0x0623 أ (ARABIC LETTER ALEF WITH HAMZA ABOVE) - 68 0x0624 ؤ (ARABIC LETTER WAW WITH HAMZA ABOVE) - 69 0x0625 إ (ARABIC LETTER ALEF WITH HAMZA BELOW) - 70 0x0626 ئ (ARABIC LETTER YEH WITH HAMZA ABOVE) - 71 0x0627 ا (ARABIC LETTER ALEF) - 72 0x0628 ب (ARABIC LETTER BEH) - 73 0x0629 ة (ARABIC LETTER TEH MARBUTA) - 74 0x062A ت (ARABIC LETTER TEH) - 75 0x062B ث (ARABIC LETTER THEH) - 76 0x062C ج (ARABIC LETTER JEEM) - 77 0x062D ح (ARABIC LETTER HAH) - 78 0x062E خ (ARABIC LETTER KHAH) - 79 0x062F د (ARABIC LETTER DAL) - 80 0x0630 ذ (ARABIC LETTER THAL) - 81 0x0631 ر (ARABIC LETTER REH) - 82 0x0632 ز (ARABIC LETTER ZAIN) - 83 0x0633 س (ARABIC LETTER SEEN) - 84 0x0634 ش (ARABIC LETTER SHEEN) - 85 0x0635 ص (ARABIC LETTER SAD) - 86 0x0636 ض (ARABIC LETTER DAD) - 87 0x0637 ط (ARABIC LETTER TAH) - 88 0x0638 ظ (ARABIC LETTER ZAH) - 89 0x0639 ع (ARABIC LETTER AIN) - 90 0x063A غ (ARABIC LETTER GHAIN) - 96 0x0640 ـ (ARABIC TATWEEL) - 97 0x0641 ف (ARABIC LETTER FEH) - 98 0x0642 ق (ARABIC LETTER QAF) - 99 0x0643 ك (ARABIC LETTER KAF) -100 0x0644 ل (ARABIC LETTER LAM) -101 0x0645 م (ARABIC LETTER MEEM) -102 0x0646 ن (ARABIC LETTER NOON) -103 0x0647 ه (ARABIC LETTER HEH) -104 0x0648 و (ARABIC LETTER WAW) -105 0x0649 ى (ARABIC LETTER ALEF MAKSURA) -106 0x064A ي (ARABIC LETTER YEH) -107 0x064B ً (ARABIC FATHATAN) -108 0x064C ٌ (ARABIC DAMMATAN) -109 0x064D ٍ (ARABIC KASRATAN) -110 0x064E َ (ARABIC FATHA) -111 0x064F ُ (ARABIC DAMMA) -112 0x0650 ِ (ARABIC KASRA) -113 0x0651 ّ (ARABIC SHADDA) -114 0x0652 ْ (ARABIC SUKUN) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-7.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-7.txt deleted file mode 100644 index 5fd6737..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-7.txt +++ /dev/null @@ -1,131 +0,0 @@ -# For details on index index-iso-8859-7.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: f53d8aeba36314ef950eef02ffcf11dff540638ce27dfe7a86b6ccc6875afb24 -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 34 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 35 0x00A3 £ (POUND SIGN) - 36 0x20AC € (EURO SIGN) - 37 0x20AF ₯ (DRACHMA SIGN) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x037A ͺ (GREEK YPOGEGRAMMENI) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 47 0x2015 ― (HORIZONTAL BAR) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x0384 ΄ (GREEK TONOS) - 53 0x0385 ΅ (GREEK DIALYTIKA TONOS) - 54 0x0386 Ά (GREEK CAPITAL LETTER ALPHA WITH TONOS) - 55 0x00B7 · (MIDDLE DOT) - 56 0x0388 Έ (GREEK CAPITAL LETTER EPSILON WITH TONOS) - 57 0x0389 Ή (GREEK CAPITAL LETTER ETA WITH TONOS) - 58 0x038A Ί (GREEK CAPITAL LETTER IOTA WITH TONOS) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x038C Ό (GREEK CAPITAL LETTER OMICRON WITH TONOS) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 62 0x038E Ύ (GREEK CAPITAL LETTER UPSILON WITH TONOS) - 63 0x038F Ώ (GREEK CAPITAL LETTER OMEGA WITH TONOS) - 64 0x0390 ΐ (GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS) - 65 0x0391 Α (GREEK CAPITAL LETTER ALPHA) - 66 0x0392 Β (GREEK CAPITAL LETTER BETA) - 67 0x0393 Γ (GREEK CAPITAL LETTER GAMMA) - 68 0x0394 Δ (GREEK CAPITAL LETTER DELTA) - 69 0x0395 Ε (GREEK CAPITAL LETTER EPSILON) - 70 0x0396 Ζ (GREEK CAPITAL LETTER ZETA) - 71 0x0397 Η (GREEK CAPITAL LETTER ETA) - 72 0x0398 Θ (GREEK CAPITAL LETTER THETA) - 73 0x0399 Ι (GREEK CAPITAL LETTER IOTA) - 74 0x039A Κ (GREEK CAPITAL LETTER KAPPA) - 75 0x039B Λ (GREEK CAPITAL LETTER LAMDA) - 76 0x039C Μ (GREEK CAPITAL LETTER MU) - 77 0x039D Ν (GREEK CAPITAL LETTER NU) - 78 0x039E Ξ (GREEK CAPITAL LETTER XI) - 79 0x039F Ο (GREEK CAPITAL LETTER OMICRON) - 80 0x03A0 Π (GREEK CAPITAL LETTER PI) - 81 0x03A1 Ρ (GREEK CAPITAL LETTER RHO) - 83 0x03A3 Σ (GREEK CAPITAL LETTER SIGMA) - 84 0x03A4 Τ (GREEK CAPITAL LETTER TAU) - 85 0x03A5 Υ (GREEK CAPITAL LETTER UPSILON) - 86 0x03A6 Φ (GREEK CAPITAL LETTER PHI) - 87 0x03A7 Χ (GREEK CAPITAL LETTER CHI) - 88 0x03A8 Ψ (GREEK CAPITAL LETTER PSI) - 89 0x03A9 Ω (GREEK CAPITAL LETTER OMEGA) - 90 0x03AA Ϊ (GREEK CAPITAL LETTER IOTA WITH DIALYTIKA) - 91 0x03AB Ϋ (GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA) - 92 0x03AC ά (GREEK SMALL LETTER ALPHA WITH TONOS) - 93 0x03AD έ (GREEK SMALL LETTER EPSILON WITH TONOS) - 94 0x03AE ή (GREEK SMALL LETTER ETA WITH TONOS) - 95 0x03AF ί (GREEK SMALL LETTER IOTA WITH TONOS) - 96 0x03B0 ΰ (GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS) - 97 0x03B1 α (GREEK SMALL LETTER ALPHA) - 98 0x03B2 β (GREEK SMALL LETTER BETA) - 99 0x03B3 γ (GREEK SMALL LETTER GAMMA) -100 0x03B4 δ (GREEK SMALL LETTER DELTA) -101 0x03B5 ε (GREEK SMALL LETTER EPSILON) -102 0x03B6 ζ (GREEK SMALL LETTER ZETA) -103 0x03B7 η (GREEK SMALL LETTER ETA) -104 0x03B8 θ (GREEK SMALL LETTER THETA) -105 0x03B9 ι (GREEK SMALL LETTER IOTA) -106 0x03BA κ (GREEK SMALL LETTER KAPPA) -107 0x03BB λ (GREEK SMALL LETTER LAMDA) -108 0x03BC μ (GREEK SMALL LETTER MU) -109 0x03BD ν (GREEK SMALL LETTER NU) -110 0x03BE ξ (GREEK SMALL LETTER XI) -111 0x03BF ο (GREEK SMALL LETTER OMICRON) -112 0x03C0 π (GREEK SMALL LETTER PI) -113 0x03C1 ρ (GREEK SMALL LETTER RHO) -114 0x03C2 ς (GREEK SMALL LETTER FINAL SIGMA) -115 0x03C3 σ (GREEK SMALL LETTER SIGMA) -116 0x03C4 τ (GREEK SMALL LETTER TAU) -117 0x03C5 υ (GREEK SMALL LETTER UPSILON) -118 0x03C6 φ (GREEK SMALL LETTER PHI) -119 0x03C7 χ (GREEK SMALL LETTER CHI) -120 0x03C8 ψ (GREEK SMALL LETTER PSI) -121 0x03C9 ω (GREEK SMALL LETTER OMEGA) -122 0x03CA ϊ (GREEK SMALL LETTER IOTA WITH DIALYTIKA) -123 0x03CB ϋ (GREEK SMALL LETTER UPSILON WITH DIALYTIKA) -124 0x03CC ό (GREEK SMALL LETTER OMICRON WITH TONOS) -125 0x03CD ύ (GREEK SMALL LETTER UPSILON WITH TONOS) -126 0x03CE ώ (GREEK SMALL LETTER OMEGA WITH TONOS) diff --git a/sub_crates/text_encoding/encoding_tables/index-iso-8859-8.txt b/sub_crates/text_encoding/encoding_tables/index-iso-8859-8.txt deleted file mode 100644 index 5aedc57..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-iso-8859-8.txt +++ /dev/null @@ -1,98 +0,0 @@ -# For details on index index-iso-8859-8.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 7657a9ca3fa875990da960d3f812eea28dcd0ae6ed55a18d5394303c86f5484b -# Date: 2018-01-06 - - 0 0x0080 € () - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x0085 … () - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x0091 ‘ () - 18 0x0092 ’ () - 19 0x0093 “ () - 20 0x0094 ” () - 21 0x0095 • () - 22 0x0096 – () - 23 0x0097 — () - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 34 0x00A2 ¢ (CENT SIGN) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x00A5 ¥ (YEN SIGN) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x00D7 × (MULTIPLICATION SIGN) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x00AF ¯ (MACRON) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x00B8 ¸ (CEDILLA) - 57 0x00B9 ¹ (SUPERSCRIPT ONE) - 58 0x00F7 ÷ (DIVISION SIGN) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) - 95 0x2017 ‗ (DOUBLE LOW LINE) - 96 0x05D0 א (HEBREW LETTER ALEF) - 97 0x05D1 ב (HEBREW LETTER BET) - 98 0x05D2 ג (HEBREW LETTER GIMEL) - 99 0x05D3 ד (HEBREW LETTER DALET) -100 0x05D4 ה (HEBREW LETTER HE) -101 0x05D5 ו (HEBREW LETTER VAV) -102 0x05D6 ז (HEBREW LETTER ZAYIN) -103 0x05D7 ח (HEBREW LETTER HET) -104 0x05D8 ט (HEBREW LETTER TET) -105 0x05D9 י (HEBREW LETTER YOD) -106 0x05DA ך (HEBREW LETTER FINAL KAF) -107 0x05DB כ (HEBREW LETTER KAF) -108 0x05DC ל (HEBREW LETTER LAMED) -109 0x05DD ם (HEBREW LETTER FINAL MEM) -110 0x05DE מ (HEBREW LETTER MEM) -111 0x05DF ן (HEBREW LETTER FINAL NUN) -112 0x05E0 נ (HEBREW LETTER NUN) -113 0x05E1 ס (HEBREW LETTER SAMEKH) -114 0x05E2 ע (HEBREW LETTER AYIN) -115 0x05E3 ף (HEBREW LETTER FINAL PE) -116 0x05E4 פ (HEBREW LETTER PE) -117 0x05E5 ץ (HEBREW LETTER FINAL TSADI) -118 0x05E6 צ (HEBREW LETTER TSADI) -119 0x05E7 ק (HEBREW LETTER QOF) -120 0x05E8 ר (HEBREW LETTER RESH) -121 0x05E9 ש (HEBREW LETTER SHIN) -122 0x05EA ת (HEBREW LETTER TAV) -125 0x200E ‎ (LEFT-TO-RIGHT MARK) -126 0x200F ‏ (RIGHT-TO-LEFT MARK) diff --git a/sub_crates/text_encoding/encoding_tables/index-koi8-r.txt b/sub_crates/text_encoding/encoding_tables/index-koi8-r.txt deleted file mode 100644 index 639e9c4..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-koi8-r.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-koi8-r.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: c5497cd9071cb352c0e56b219154e539badf63de40b71578f09e2e11fe7d50ae -# Date: 2018-01-06 - - 0 0x2500 ─ (BOX DRAWINGS LIGHT HORIZONTAL) - 1 0x2502 │ (BOX DRAWINGS LIGHT VERTICAL) - 2 0x250C ┌ (BOX DRAWINGS LIGHT DOWN AND RIGHT) - 3 0x2510 ┐ (BOX DRAWINGS LIGHT DOWN AND LEFT) - 4 0x2514 └ (BOX DRAWINGS LIGHT UP AND RIGHT) - 5 0x2518 ┘ (BOX DRAWINGS LIGHT UP AND LEFT) - 6 0x251C ├ (BOX DRAWINGS LIGHT VERTICAL AND RIGHT) - 7 0x2524 ┤ (BOX DRAWINGS LIGHT VERTICAL AND LEFT) - 8 0x252C ┬ (BOX DRAWINGS LIGHT DOWN AND HORIZONTAL) - 9 0x2534 ┴ (BOX DRAWINGS LIGHT UP AND HORIZONTAL) - 10 0x253C ┼ (BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL) - 11 0x2580 ▀ (UPPER HALF BLOCK) - 12 0x2584 ▄ (LOWER HALF BLOCK) - 13 0x2588 █ (FULL BLOCK) - 14 0x258C ▌ (LEFT HALF BLOCK) - 15 0x2590 ▐ (RIGHT HALF BLOCK) - 16 0x2591 ░ (LIGHT SHADE) - 17 0x2592 ▒ (MEDIUM SHADE) - 18 0x2593 ▓ (DARK SHADE) - 19 0x2320 ⌠ (TOP HALF INTEGRAL) - 20 0x25A0 ■ (BLACK SQUARE) - 21 0x2219 ∙ (BULLET OPERATOR) - 22 0x221A √ (SQUARE ROOT) - 23 0x2248 ≈ (ALMOST EQUAL TO) - 24 0x2264 ≤ (LESS-THAN OR EQUAL TO) - 25 0x2265 ≥ (GREATER-THAN OR EQUAL TO) - 26 0x00A0   (NO-BREAK SPACE) - 27 0x2321 ⌡ (BOTTOM HALF INTEGRAL) - 28 0x00B0 ° (DEGREE SIGN) - 29 0x00B2 ² (SUPERSCRIPT TWO) - 30 0x00B7 · (MIDDLE DOT) - 31 0x00F7 ÷ (DIVISION SIGN) - 32 0x2550 ═ (BOX DRAWINGS DOUBLE HORIZONTAL) - 33 0x2551 ║ (BOX DRAWINGS DOUBLE VERTICAL) - 34 0x2552 ╒ (BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE) - 35 0x0451 ё (CYRILLIC SMALL LETTER IO) - 36 0x2553 ╓ (BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE) - 37 0x2554 ╔ (BOX DRAWINGS DOUBLE DOWN AND RIGHT) - 38 0x2555 ╕ (BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE) - 39 0x2556 ╖ (BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE) - 40 0x2557 ╗ (BOX DRAWINGS DOUBLE DOWN AND LEFT) - 41 0x2558 ╘ (BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE) - 42 0x2559 ╙ (BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE) - 43 0x255A ╚ (BOX DRAWINGS DOUBLE UP AND RIGHT) - 44 0x255B ╛ (BOX DRAWINGS UP SINGLE AND LEFT DOUBLE) - 45 0x255C ╜ (BOX DRAWINGS UP DOUBLE AND LEFT SINGLE) - 46 0x255D ╝ (BOX DRAWINGS DOUBLE UP AND LEFT) - 47 0x255E ╞ (BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE) - 48 0x255F ╟ (BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE) - 49 0x2560 ╠ (BOX DRAWINGS DOUBLE VERTICAL AND RIGHT) - 50 0x2561 ╡ (BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE) - 51 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) - 52 0x2562 ╢ (BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE) - 53 0x2563 ╣ (BOX DRAWINGS DOUBLE VERTICAL AND LEFT) - 54 0x2564 ╤ (BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE) - 55 0x2565 ╥ (BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE) - 56 0x2566 ╦ (BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL) - 57 0x2567 ╧ (BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE) - 58 0x2568 ╨ (BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE) - 59 0x2569 ╩ (BOX DRAWINGS DOUBLE UP AND HORIZONTAL) - 60 0x256A ╪ (BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE) - 61 0x256B ╫ (BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE) - 62 0x256C ╬ (BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL) - 63 0x00A9 © (COPYRIGHT SIGN) - 64 0x044E ю (CYRILLIC SMALL LETTER YU) - 65 0x0430 а (CYRILLIC SMALL LETTER A) - 66 0x0431 б (CYRILLIC SMALL LETTER BE) - 67 0x0446 ц (CYRILLIC SMALL LETTER TSE) - 68 0x0434 д (CYRILLIC SMALL LETTER DE) - 69 0x0435 е (CYRILLIC SMALL LETTER IE) - 70 0x0444 ф (CYRILLIC SMALL LETTER EF) - 71 0x0433 г (CYRILLIC SMALL LETTER GHE) - 72 0x0445 х (CYRILLIC SMALL LETTER HA) - 73 0x0438 и (CYRILLIC SMALL LETTER I) - 74 0x0439 й (CYRILLIC SMALL LETTER SHORT I) - 75 0x043A к (CYRILLIC SMALL LETTER KA) - 76 0x043B л (CYRILLIC SMALL LETTER EL) - 77 0x043C м (CYRILLIC SMALL LETTER EM) - 78 0x043D н (CYRILLIC SMALL LETTER EN) - 79 0x043E о (CYRILLIC SMALL LETTER O) - 80 0x043F п (CYRILLIC SMALL LETTER PE) - 81 0x044F я (CYRILLIC SMALL LETTER YA) - 82 0x0440 р (CYRILLIC SMALL LETTER ER) - 83 0x0441 с (CYRILLIC SMALL LETTER ES) - 84 0x0442 т (CYRILLIC SMALL LETTER TE) - 85 0x0443 у (CYRILLIC SMALL LETTER U) - 86 0x0436 ж (CYRILLIC SMALL LETTER ZHE) - 87 0x0432 в (CYRILLIC SMALL LETTER VE) - 88 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) - 89 0x044B ы (CYRILLIC SMALL LETTER YERU) - 90 0x0437 з (CYRILLIC SMALL LETTER ZE) - 91 0x0448 ш (CYRILLIC SMALL LETTER SHA) - 92 0x044D э (CYRILLIC SMALL LETTER E) - 93 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) - 94 0x0447 ч (CYRILLIC SMALL LETTER CHE) - 95 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) - 96 0x042E Ю (CYRILLIC CAPITAL LETTER YU) - 97 0x0410 А (CYRILLIC CAPITAL LETTER A) - 98 0x0411 Б (CYRILLIC CAPITAL LETTER BE) - 99 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) -100 0x0414 Д (CYRILLIC CAPITAL LETTER DE) -101 0x0415 Е (CYRILLIC CAPITAL LETTER IE) -102 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) -103 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) -104 0x0425 Х (CYRILLIC CAPITAL LETTER HA) -105 0x0418 И (CYRILLIC CAPITAL LETTER I) -106 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) -107 0x041A К (CYRILLIC CAPITAL LETTER KA) -108 0x041B Л (CYRILLIC CAPITAL LETTER EL) -109 0x041C М (CYRILLIC CAPITAL LETTER EM) -110 0x041D Н (CYRILLIC CAPITAL LETTER EN) -111 0x041E О (CYRILLIC CAPITAL LETTER O) -112 0x041F П (CYRILLIC CAPITAL LETTER PE) -113 0x042F Я (CYRILLIC CAPITAL LETTER YA) -114 0x0420 Р (CYRILLIC CAPITAL LETTER ER) -115 0x0421 С (CYRILLIC CAPITAL LETTER ES) -116 0x0422 Т (CYRILLIC CAPITAL LETTER TE) -117 0x0423 У (CYRILLIC CAPITAL LETTER U) -118 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) -119 0x0412 В (CYRILLIC CAPITAL LETTER VE) -120 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) -121 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) -122 0x0417 З (CYRILLIC CAPITAL LETTER ZE) -123 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) -124 0x042D Э (CYRILLIC CAPITAL LETTER E) -125 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) -126 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) -127 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) diff --git a/sub_crates/text_encoding/encoding_tables/index-koi8-u.txt b/sub_crates/text_encoding/encoding_tables/index-koi8-u.txt deleted file mode 100644 index 6654e43..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-koi8-u.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-koi8-u.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 19a4da2c3f245118bbc8019326f45a07832949938ff903f03d62ac4da1f61f40 -# Date: 2018-01-06 - - 0 0x2500 ─ (BOX DRAWINGS LIGHT HORIZONTAL) - 1 0x2502 │ (BOX DRAWINGS LIGHT VERTICAL) - 2 0x250C ┌ (BOX DRAWINGS LIGHT DOWN AND RIGHT) - 3 0x2510 ┐ (BOX DRAWINGS LIGHT DOWN AND LEFT) - 4 0x2514 └ (BOX DRAWINGS LIGHT UP AND RIGHT) - 5 0x2518 ┘ (BOX DRAWINGS LIGHT UP AND LEFT) - 6 0x251C ├ (BOX DRAWINGS LIGHT VERTICAL AND RIGHT) - 7 0x2524 ┤ (BOX DRAWINGS LIGHT VERTICAL AND LEFT) - 8 0x252C ┬ (BOX DRAWINGS LIGHT DOWN AND HORIZONTAL) - 9 0x2534 ┴ (BOX DRAWINGS LIGHT UP AND HORIZONTAL) - 10 0x253C ┼ (BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL) - 11 0x2580 ▀ (UPPER HALF BLOCK) - 12 0x2584 ▄ (LOWER HALF BLOCK) - 13 0x2588 █ (FULL BLOCK) - 14 0x258C ▌ (LEFT HALF BLOCK) - 15 0x2590 ▐ (RIGHT HALF BLOCK) - 16 0x2591 ░ (LIGHT SHADE) - 17 0x2592 ▒ (MEDIUM SHADE) - 18 0x2593 ▓ (DARK SHADE) - 19 0x2320 ⌠ (TOP HALF INTEGRAL) - 20 0x25A0 ■ (BLACK SQUARE) - 21 0x2219 ∙ (BULLET OPERATOR) - 22 0x221A √ (SQUARE ROOT) - 23 0x2248 ≈ (ALMOST EQUAL TO) - 24 0x2264 ≤ (LESS-THAN OR EQUAL TO) - 25 0x2265 ≥ (GREATER-THAN OR EQUAL TO) - 26 0x00A0   (NO-BREAK SPACE) - 27 0x2321 ⌡ (BOTTOM HALF INTEGRAL) - 28 0x00B0 ° (DEGREE SIGN) - 29 0x00B2 ² (SUPERSCRIPT TWO) - 30 0x00B7 · (MIDDLE DOT) - 31 0x00F7 ÷ (DIVISION SIGN) - 32 0x2550 ═ (BOX DRAWINGS DOUBLE HORIZONTAL) - 33 0x2551 ║ (BOX DRAWINGS DOUBLE VERTICAL) - 34 0x2552 ╒ (BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE) - 35 0x0451 ё (CYRILLIC SMALL LETTER IO) - 36 0x0454 є (CYRILLIC SMALL LETTER UKRAINIAN IE) - 37 0x2554 ╔ (BOX DRAWINGS DOUBLE DOWN AND RIGHT) - 38 0x0456 і (CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I) - 39 0x0457 ї (CYRILLIC SMALL LETTER YI) - 40 0x2557 ╗ (BOX DRAWINGS DOUBLE DOWN AND LEFT) - 41 0x2558 ╘ (BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE) - 42 0x2559 ╙ (BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE) - 43 0x255A ╚ (BOX DRAWINGS DOUBLE UP AND RIGHT) - 44 0x255B ╛ (BOX DRAWINGS UP SINGLE AND LEFT DOUBLE) - 45 0x0491 ґ (CYRILLIC SMALL LETTER GHE WITH UPTURN) - 46 0x045E ў (CYRILLIC SMALL LETTER SHORT U) - 47 0x255E ╞ (BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE) - 48 0x255F ╟ (BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE) - 49 0x2560 ╠ (BOX DRAWINGS DOUBLE VERTICAL AND RIGHT) - 50 0x2561 ╡ (BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE) - 51 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) - 52 0x0404 Є (CYRILLIC CAPITAL LETTER UKRAINIAN IE) - 53 0x2563 ╣ (BOX DRAWINGS DOUBLE VERTICAL AND LEFT) - 54 0x0406 І (CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I) - 55 0x0407 Ї (CYRILLIC CAPITAL LETTER YI) - 56 0x2566 ╦ (BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL) - 57 0x2567 ╧ (BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE) - 58 0x2568 ╨ (BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE) - 59 0x2569 ╩ (BOX DRAWINGS DOUBLE UP AND HORIZONTAL) - 60 0x256A ╪ (BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE) - 61 0x0490 Ґ (CYRILLIC CAPITAL LETTER GHE WITH UPTURN) - 62 0x040E Ў (CYRILLIC CAPITAL LETTER SHORT U) - 63 0x00A9 © (COPYRIGHT SIGN) - 64 0x044E ю (CYRILLIC SMALL LETTER YU) - 65 0x0430 а (CYRILLIC SMALL LETTER A) - 66 0x0431 б (CYRILLIC SMALL LETTER BE) - 67 0x0446 ц (CYRILLIC SMALL LETTER TSE) - 68 0x0434 д (CYRILLIC SMALL LETTER DE) - 69 0x0435 е (CYRILLIC SMALL LETTER IE) - 70 0x0444 ф (CYRILLIC SMALL LETTER EF) - 71 0x0433 г (CYRILLIC SMALL LETTER GHE) - 72 0x0445 х (CYRILLIC SMALL LETTER HA) - 73 0x0438 и (CYRILLIC SMALL LETTER I) - 74 0x0439 й (CYRILLIC SMALL LETTER SHORT I) - 75 0x043A к (CYRILLIC SMALL LETTER KA) - 76 0x043B л (CYRILLIC SMALL LETTER EL) - 77 0x043C м (CYRILLIC SMALL LETTER EM) - 78 0x043D н (CYRILLIC SMALL LETTER EN) - 79 0x043E о (CYRILLIC SMALL LETTER O) - 80 0x043F п (CYRILLIC SMALL LETTER PE) - 81 0x044F я (CYRILLIC SMALL LETTER YA) - 82 0x0440 р (CYRILLIC SMALL LETTER ER) - 83 0x0441 с (CYRILLIC SMALL LETTER ES) - 84 0x0442 т (CYRILLIC SMALL LETTER TE) - 85 0x0443 у (CYRILLIC SMALL LETTER U) - 86 0x0436 ж (CYRILLIC SMALL LETTER ZHE) - 87 0x0432 в (CYRILLIC SMALL LETTER VE) - 88 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) - 89 0x044B ы (CYRILLIC SMALL LETTER YERU) - 90 0x0437 з (CYRILLIC SMALL LETTER ZE) - 91 0x0448 ш (CYRILLIC SMALL LETTER SHA) - 92 0x044D э (CYRILLIC SMALL LETTER E) - 93 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) - 94 0x0447 ч (CYRILLIC SMALL LETTER CHE) - 95 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) - 96 0x042E Ю (CYRILLIC CAPITAL LETTER YU) - 97 0x0410 А (CYRILLIC CAPITAL LETTER A) - 98 0x0411 Б (CYRILLIC CAPITAL LETTER BE) - 99 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) -100 0x0414 Д (CYRILLIC CAPITAL LETTER DE) -101 0x0415 Е (CYRILLIC CAPITAL LETTER IE) -102 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) -103 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) -104 0x0425 Х (CYRILLIC CAPITAL LETTER HA) -105 0x0418 И (CYRILLIC CAPITAL LETTER I) -106 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) -107 0x041A К (CYRILLIC CAPITAL LETTER KA) -108 0x041B Л (CYRILLIC CAPITAL LETTER EL) -109 0x041C М (CYRILLIC CAPITAL LETTER EM) -110 0x041D Н (CYRILLIC CAPITAL LETTER EN) -111 0x041E О (CYRILLIC CAPITAL LETTER O) -112 0x041F П (CYRILLIC CAPITAL LETTER PE) -113 0x042F Я (CYRILLIC CAPITAL LETTER YA) -114 0x0420 Р (CYRILLIC CAPITAL LETTER ER) -115 0x0421 С (CYRILLIC CAPITAL LETTER ES) -116 0x0422 Т (CYRILLIC CAPITAL LETTER TE) -117 0x0423 У (CYRILLIC CAPITAL LETTER U) -118 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) -119 0x0412 В (CYRILLIC CAPITAL LETTER VE) -120 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) -121 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) -122 0x0417 З (CYRILLIC CAPITAL LETTER ZE) -123 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) -124 0x042D Э (CYRILLIC CAPITAL LETTER E) -125 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) -126 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) -127 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) diff --git a/sub_crates/text_encoding/encoding_tables/index-macintosh.txt b/sub_crates/text_encoding/encoding_tables/index-macintosh.txt deleted file mode 100644 index e841a89..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-macintosh.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-macintosh.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: f2c6a4f6406b3e86a50a5dba4d2b7dd48e2e33c0d82aefe764535c934ec11764 -# Date: 2018-01-06 - - 0 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 1 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) - 2 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) - 3 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 4 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) - 5 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 6 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 7 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 8 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) - 9 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 10 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) - 11 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) - 12 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) - 13 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) - 14 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) - 15 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) - 16 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) - 17 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) - 18 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) - 19 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) - 20 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) - 21 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) - 22 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) - 23 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) - 24 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) - 25 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) - 26 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) - 27 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) - 28 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) - 29 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) - 30 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) - 31 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) - 32 0x2020 † (DAGGER) - 33 0x00B0 ° (DEGREE SIGN) - 34 0x00A2 ¢ (CENT SIGN) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A7 § (SECTION SIGN) - 37 0x2022 • (BULLET) - 38 0x00B6 ¶ (PILCROW SIGN) - 39 0x00DF ß (LATIN SMALL LETTER SHARP S) - 40 0x00AE ® (REGISTERED SIGN) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x2122 ™ (TRADE MARK SIGN) - 43 0x00B4 ´ (ACUTE ACCENT) - 44 0x00A8 ¨ (DIAERESIS) - 45 0x2260 ≠ (NOT EQUAL TO) - 46 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 47 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) - 48 0x221E ∞ (INFINITY) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x2264 ≤ (LESS-THAN OR EQUAL TO) - 51 0x2265 ≥ (GREATER-THAN OR EQUAL TO) - 52 0x00A5 ¥ (YEN SIGN) - 53 0x00B5 µ (MICRO SIGN) - 54 0x2202 ∂ (PARTIAL DIFFERENTIAL) - 55 0x2211 ∑ (N-ARY SUMMATION) - 56 0x220F ∏ (N-ARY PRODUCT) - 57 0x03C0 π (GREEK SMALL LETTER PI) - 58 0x222B ∫ (INTEGRAL) - 59 0x00AA ª (FEMININE ORDINAL INDICATOR) - 60 0x00BA º (MASCULINE ORDINAL INDICATOR) - 61 0x03A9 Ω (GREEK CAPITAL LETTER OMEGA) - 62 0x00E6 æ (LATIN SMALL LETTER AE) - 63 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) - 64 0x00BF ¿ (INVERTED QUESTION MARK) - 65 0x00A1 ¡ (INVERTED EXCLAMATION MARK) - 66 0x00AC ¬ (NOT SIGN) - 67 0x221A √ (SQUARE ROOT) - 68 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) - 69 0x2248 ≈ (ALMOST EQUAL TO) - 70 0x2206 ∆ (INCREMENT) - 71 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 72 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 73 0x2026 … (HORIZONTAL ELLIPSIS) - 74 0x00A0   (NO-BREAK SPACE) - 75 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) - 76 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) - 77 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) - 78 0x0152 Œ (LATIN CAPITAL LIGATURE OE) - 79 0x0153 œ (LATIN SMALL LIGATURE OE) - 80 0x2013 – (EN DASH) - 81 0x2014 — (EM DASH) - 82 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 83 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 84 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 85 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 86 0x00F7 ÷ (DIVISION SIGN) - 87 0x25CA ◊ (LOZENGE) - 88 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) - 89 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) - 90 0x2044 ⁄ (FRACTION SLASH) - 91 0x20AC € (EURO SIGN) - 92 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) - 93 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) - 94 0xFB01 fi (LATIN SMALL LIGATURE FI) - 95 0xFB02 fl (LATIN SMALL LIGATURE FL) - 96 0x2021 ‡ (DOUBLE DAGGER) - 97 0x00B7 · (MIDDLE DOT) - 98 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) - 99 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) -100 0x2030 ‰ (PER MILLE SIGN) -101 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) -102 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) -103 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) -104 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) -105 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) -106 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) -107 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) -108 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) -109 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) -110 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) -111 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) -112 0xF8FF  () -113 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) -114 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) -115 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) -116 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) -117 0x0131 ı (LATIN SMALL LETTER DOTLESS I) -118 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) -119 0x02DC ˜ (SMALL TILDE) -120 0x00AF ¯ (MACRON) -121 0x02D8 ˘ (BREVE) -122 0x02D9 ˙ (DOT ABOVE) -123 0x02DA ˚ (RING ABOVE) -124 0x00B8 ¸ (CEDILLA) -125 0x02DD ˝ (DOUBLE ACUTE ACCENT) -126 0x02DB ˛ (OGONEK) -127 0x02C7 ˇ (CARON) diff --git a/sub_crates/text_encoding/encoding_tables/index-windows-1250.txt b/sub_crates/text_encoding/encoding_tables/index-windows-1250.txt deleted file mode 100644 index 870946a..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-windows-1250.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-windows-1250.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 0669455a7a1c70ba6003ea737991e8ee9adc455125c13cfe6705a361358de5fa -# Date: 2018-01-06 - - 0 0x20AC € (EURO SIGN) - 1 0x0081  () - 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) - 3 0x0083 ƒ () - 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 5 0x2026 … (HORIZONTAL ELLIPSIS) - 6 0x2020 † (DAGGER) - 7 0x2021 ‡ (DOUBLE DAGGER) - 8 0x0088 ˆ () - 9 0x2030 ‰ (PER MILLE SIGN) - 10 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) - 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) - 12 0x015A Ś (LATIN CAPITAL LETTER S WITH ACUTE) - 13 0x0164 Ť (LATIN CAPITAL LETTER T WITH CARON) - 14 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) - 15 0x0179 Ź (LATIN CAPITAL LETTER Z WITH ACUTE) - 16 0x0090  () - 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 21 0x2022 • (BULLET) - 22 0x2013 – (EN DASH) - 23 0x2014 — (EM DASH) - 24 0x0098 ˜ () - 25 0x2122 ™ (TRADE MARK SIGN) - 26 0x0161 š (LATIN SMALL LETTER S WITH CARON) - 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) - 28 0x015B ś (LATIN SMALL LETTER S WITH ACUTE) - 29 0x0165 ť (LATIN SMALL LETTER T WITH CARON) - 30 0x017E ž (LATIN SMALL LETTER Z WITH CARON) - 31 0x017A ź (LATIN SMALL LETTER Z WITH ACUTE) - 32 0x00A0   (NO-BREAK SPACE) - 33 0x02C7 ˇ (CARON) - 34 0x02D8 ˘ (BREVE) - 35 0x0141 Ł (LATIN CAPITAL LETTER L WITH STROKE) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x015E Ş (LATIN CAPITAL LETTER S WITH CEDILLA) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x02DB ˛ (OGONEK) - 51 0x0142 ł (LATIN SMALL LETTER L WITH STROKE) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x00B8 ¸ (CEDILLA) - 57 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) - 58 0x015F ş (LATIN SMALL LETTER S WITH CEDILLA) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x013D Ľ (LATIN CAPITAL LETTER L WITH CARON) - 61 0x02DD ˝ (DOUBLE ACUTE ACCENT) - 62 0x013E ľ (LATIN SMALL LETTER L WITH CARON) - 63 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) - 64 0x0154 Ŕ (LATIN CAPITAL LETTER R WITH ACUTE) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 67 0x0102 Ă (LATIN CAPITAL LETTER A WITH BREVE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x0139 Ĺ (LATIN CAPITAL LETTER L WITH ACUTE) - 70 0x0106 Ć (LATIN CAPITAL LETTER C WITH ACUTE) - 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) - 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x011A Ě (LATIN CAPITAL LETTER E WITH CARON) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x010E Ď (LATIN CAPITAL LETTER D WITH CARON) - 80 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) - 81 0x0143 Ń (LATIN CAPITAL LETTER N WITH ACUTE) - 82 0x0147 Ň (LATIN CAPITAL LETTER N WITH CARON) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x0150 Ő (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x0158 Ř (LATIN CAPITAL LETTER R WITH CARON) - 89 0x016E Ů (LATIN CAPITAL LETTER U WITH RING ABOVE) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x0170 Ű (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) - 94 0x0162 Ţ (LATIN CAPITAL LETTER T WITH CEDILLA) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x0155 ŕ (LATIN SMALL LETTER R WITH ACUTE) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x0103 ă (LATIN SMALL LETTER A WITH BREVE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x013A ĺ (LATIN SMALL LETTER L WITH ACUTE) -102 0x0107 ć (LATIN SMALL LETTER C WITH ACUTE) -103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) -104 0x010D č (LATIN SMALL LETTER C WITH CARON) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x011B ě (LATIN SMALL LETTER E WITH CARON) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x010F ď (LATIN SMALL LETTER D WITH CARON) -112 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) -113 0x0144 ń (LATIN SMALL LETTER N WITH ACUTE) -114 0x0148 ň (LATIN SMALL LETTER N WITH CARON) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x0151 ő (LATIN SMALL LETTER O WITH DOUBLE ACUTE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x0159 ř (LATIN SMALL LETTER R WITH CARON) -121 0x016F ů (LATIN SMALL LETTER U WITH RING ABOVE) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x0171 ű (LATIN SMALL LETTER U WITH DOUBLE ACUTE) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) -126 0x0163 ţ (LATIN SMALL LETTER T WITH CEDILLA) -127 0x02D9 ˙ (DOT ABOVE) diff --git a/sub_crates/text_encoding/encoding_tables/index-windows-1251.txt b/sub_crates/text_encoding/encoding_tables/index-windows-1251.txt deleted file mode 100644 index 319e813..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-windows-1251.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-windows-1251.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 7592ef921679ba168b00a9e9afa3b4eebd67bf13dc7e84c4b6e120de856826e0 -# Date: 2018-01-06 - - 0 0x0402 Ђ (CYRILLIC CAPITAL LETTER DJE) - 1 0x0403 Ѓ (CYRILLIC CAPITAL LETTER GJE) - 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) - 3 0x0453 ѓ (CYRILLIC SMALL LETTER GJE) - 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 5 0x2026 … (HORIZONTAL ELLIPSIS) - 6 0x2020 † (DAGGER) - 7 0x2021 ‡ (DOUBLE DAGGER) - 8 0x20AC € (EURO SIGN) - 9 0x2030 ‰ (PER MILLE SIGN) - 10 0x0409 Љ (CYRILLIC CAPITAL LETTER LJE) - 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) - 12 0x040A Њ (CYRILLIC CAPITAL LETTER NJE) - 13 0x040C Ќ (CYRILLIC CAPITAL LETTER KJE) - 14 0x040B Ћ (CYRILLIC CAPITAL LETTER TSHE) - 15 0x040F Џ (CYRILLIC CAPITAL LETTER DZHE) - 16 0x0452 ђ (CYRILLIC SMALL LETTER DJE) - 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 21 0x2022 • (BULLET) - 22 0x2013 – (EN DASH) - 23 0x2014 — (EM DASH) - 24 0x0098 ˜ () - 25 0x2122 ™ (TRADE MARK SIGN) - 26 0x0459 љ (CYRILLIC SMALL LETTER LJE) - 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) - 28 0x045A њ (CYRILLIC SMALL LETTER NJE) - 29 0x045C ќ (CYRILLIC SMALL LETTER KJE) - 30 0x045B ћ (CYRILLIC SMALL LETTER TSHE) - 31 0x045F џ (CYRILLIC SMALL LETTER DZHE) - 32 0x00A0   (NO-BREAK SPACE) - 33 0x040E Ў (CYRILLIC CAPITAL LETTER SHORT U) - 34 0x045E ў (CYRILLIC SMALL LETTER SHORT U) - 35 0x0408 Ј (CYRILLIC CAPITAL LETTER JE) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x0490 Ґ (CYRILLIC CAPITAL LETTER GHE WITH UPTURN) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x0404 Є (CYRILLIC CAPITAL LETTER UKRAINIAN IE) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x0407 Ї (CYRILLIC CAPITAL LETTER YI) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x0406 І (CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I) - 51 0x0456 і (CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I) - 52 0x0491 ґ (CYRILLIC SMALL LETTER GHE WITH UPTURN) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x0451 ё (CYRILLIC SMALL LETTER IO) - 57 0x2116 № (NUMERO SIGN) - 58 0x0454 є (CYRILLIC SMALL LETTER UKRAINIAN IE) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x0458 ј (CYRILLIC SMALL LETTER JE) - 61 0x0405 Ѕ (CYRILLIC CAPITAL LETTER DZE) - 62 0x0455 ѕ (CYRILLIC SMALL LETTER DZE) - 63 0x0457 ї (CYRILLIC SMALL LETTER YI) - 64 0x0410 А (CYRILLIC CAPITAL LETTER A) - 65 0x0411 Б (CYRILLIC CAPITAL LETTER BE) - 66 0x0412 В (CYRILLIC CAPITAL LETTER VE) - 67 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) - 68 0x0414 Д (CYRILLIC CAPITAL LETTER DE) - 69 0x0415 Е (CYRILLIC CAPITAL LETTER IE) - 70 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) - 71 0x0417 З (CYRILLIC CAPITAL LETTER ZE) - 72 0x0418 И (CYRILLIC CAPITAL LETTER I) - 73 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) - 74 0x041A К (CYRILLIC CAPITAL LETTER KA) - 75 0x041B Л (CYRILLIC CAPITAL LETTER EL) - 76 0x041C М (CYRILLIC CAPITAL LETTER EM) - 77 0x041D Н (CYRILLIC CAPITAL LETTER EN) - 78 0x041E О (CYRILLIC CAPITAL LETTER O) - 79 0x041F П (CYRILLIC CAPITAL LETTER PE) - 80 0x0420 Р (CYRILLIC CAPITAL LETTER ER) - 81 0x0421 С (CYRILLIC CAPITAL LETTER ES) - 82 0x0422 Т (CYRILLIC CAPITAL LETTER TE) - 83 0x0423 У (CYRILLIC CAPITAL LETTER U) - 84 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) - 85 0x0425 Х (CYRILLIC CAPITAL LETTER HA) - 86 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) - 87 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) - 88 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) - 89 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) - 90 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) - 91 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) - 92 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) - 93 0x042D Э (CYRILLIC CAPITAL LETTER E) - 94 0x042E Ю (CYRILLIC CAPITAL LETTER YU) - 95 0x042F Я (CYRILLIC CAPITAL LETTER YA) - 96 0x0430 а (CYRILLIC SMALL LETTER A) - 97 0x0431 б (CYRILLIC SMALL LETTER BE) - 98 0x0432 в (CYRILLIC SMALL LETTER VE) - 99 0x0433 г (CYRILLIC SMALL LETTER GHE) -100 0x0434 д (CYRILLIC SMALL LETTER DE) -101 0x0435 е (CYRILLIC SMALL LETTER IE) -102 0x0436 ж (CYRILLIC SMALL LETTER ZHE) -103 0x0437 з (CYRILLIC SMALL LETTER ZE) -104 0x0438 и (CYRILLIC SMALL LETTER I) -105 0x0439 й (CYRILLIC SMALL LETTER SHORT I) -106 0x043A к (CYRILLIC SMALL LETTER KA) -107 0x043B л (CYRILLIC SMALL LETTER EL) -108 0x043C м (CYRILLIC SMALL LETTER EM) -109 0x043D н (CYRILLIC SMALL LETTER EN) -110 0x043E о (CYRILLIC SMALL LETTER O) -111 0x043F п (CYRILLIC SMALL LETTER PE) -112 0x0440 р (CYRILLIC SMALL LETTER ER) -113 0x0441 с (CYRILLIC SMALL LETTER ES) -114 0x0442 т (CYRILLIC SMALL LETTER TE) -115 0x0443 у (CYRILLIC SMALL LETTER U) -116 0x0444 ф (CYRILLIC SMALL LETTER EF) -117 0x0445 х (CYRILLIC SMALL LETTER HA) -118 0x0446 ц (CYRILLIC SMALL LETTER TSE) -119 0x0447 ч (CYRILLIC SMALL LETTER CHE) -120 0x0448 ш (CYRILLIC SMALL LETTER SHA) -121 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) -122 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) -123 0x044B ы (CYRILLIC SMALL LETTER YERU) -124 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) -125 0x044D э (CYRILLIC SMALL LETTER E) -126 0x044E ю (CYRILLIC SMALL LETTER YU) -127 0x044F я (CYRILLIC SMALL LETTER YA) diff --git a/sub_crates/text_encoding/encoding_tables/index-windows-1252.txt b/sub_crates/text_encoding/encoding_tables/index-windows-1252.txt deleted file mode 100644 index 56c5a0d..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-windows-1252.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-windows-1252.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: e56d49d9176e9a412283cf29ac9bd613f5620462f2a080a84eceaf974cfa18b7 -# Date: 2018-01-06 - - 0 0x20AC € (EURO SIGN) - 1 0x0081  () - 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) - 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) - 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 5 0x2026 … (HORIZONTAL ELLIPSIS) - 6 0x2020 † (DAGGER) - 7 0x2021 ‡ (DOUBLE DAGGER) - 8 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) - 9 0x2030 ‰ (PER MILLE SIGN) - 10 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) - 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) - 12 0x0152 Œ (LATIN CAPITAL LIGATURE OE) - 13 0x008D  () - 14 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) - 15 0x008F  () - 16 0x0090  () - 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 21 0x2022 • (BULLET) - 22 0x2013 – (EN DASH) - 23 0x2014 — (EM DASH) - 24 0x02DC ˜ (SMALL TILDE) - 25 0x2122 ™ (TRADE MARK SIGN) - 26 0x0161 š (LATIN SMALL LETTER S WITH CARON) - 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) - 28 0x0153 œ (LATIN SMALL LIGATURE OE) - 29 0x009D  () - 30 0x017E ž (LATIN SMALL LETTER Z WITH CARON) - 31 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) - 32 0x00A0   (NO-BREAK SPACE) - 33 0x00A1 ¡ (INVERTED EXCLAMATION MARK) - 34 0x00A2 ¢ (CENT SIGN) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x00A5 ¥ (YEN SIGN) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x00AA ª (FEMININE ORDINAL INDICATOR) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x00AF ¯ (MACRON) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x00B8 ¸ (CEDILLA) - 57 0x00B9 ¹ (SUPERSCRIPT ONE) - 58 0x00BA º (MASCULINE ORDINAL INDICATOR) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) - 63 0x00BF ¿ (INVERTED QUESTION MARK) - 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) - 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) - 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) - 80 0x00D0 Ð (LATIN CAPITAL LETTER ETH) - 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) - 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) - 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) - 94 0x00DE Þ (LATIN CAPITAL LETTER THORN) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) -102 0x00E6 æ (LATIN SMALL LETTER AE) -103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) -104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) -112 0x00F0 ð (LATIN SMALL LETTER ETH) -113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) -114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) -121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) -126 0x00FE þ (LATIN SMALL LETTER THORN) -127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/sub_crates/text_encoding/encoding_tables/index-windows-1253.txt b/sub_crates/text_encoding/encoding_tables/index-windows-1253.txt deleted file mode 100644 index 9092f57..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-windows-1253.txt +++ /dev/null @@ -1,131 +0,0 @@ -# For details on index index-windows-1253.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 49fdc881a3488904dd1e8dfba9aef3258454249958b611bcded1d4c981ab5561 -# Date: 2018-01-06 - - 0 0x20AC € (EURO SIGN) - 1 0x0081  () - 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) - 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) - 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 5 0x2026 … (HORIZONTAL ELLIPSIS) - 6 0x2020 † (DAGGER) - 7 0x2021 ‡ (DOUBLE DAGGER) - 8 0x0088 ˆ () - 9 0x2030 ‰ (PER MILLE SIGN) - 10 0x008A Š () - 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 21 0x2022 • (BULLET) - 22 0x2013 – (EN DASH) - 23 0x2014 — (EM DASH) - 24 0x0098 ˜ () - 25 0x2122 ™ (TRADE MARK SIGN) - 26 0x009A š () - 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x0385 ΅ (GREEK DIALYTIKA TONOS) - 34 0x0386 Ά (GREEK CAPITAL LETTER ALPHA WITH TONOS) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x00A5 ¥ (YEN SIGN) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x00A9 © (COPYRIGHT SIGN) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x2015 ― (HORIZONTAL BAR) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x0384 ΄ (GREEK TONOS) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x0388 Έ (GREEK CAPITAL LETTER EPSILON WITH TONOS) - 57 0x0389 Ή (GREEK CAPITAL LETTER ETA WITH TONOS) - 58 0x038A Ί (GREEK CAPITAL LETTER IOTA WITH TONOS) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x038C Ό (GREEK CAPITAL LETTER OMICRON WITH TONOS) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 62 0x038E Ύ (GREEK CAPITAL LETTER UPSILON WITH TONOS) - 63 0x038F Ώ (GREEK CAPITAL LETTER OMEGA WITH TONOS) - 64 0x0390 ΐ (GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS) - 65 0x0391 Α (GREEK CAPITAL LETTER ALPHA) - 66 0x0392 Β (GREEK CAPITAL LETTER BETA) - 67 0x0393 Γ (GREEK CAPITAL LETTER GAMMA) - 68 0x0394 Δ (GREEK CAPITAL LETTER DELTA) - 69 0x0395 Ε (GREEK CAPITAL LETTER EPSILON) - 70 0x0396 Ζ (GREEK CAPITAL LETTER ZETA) - 71 0x0397 Η (GREEK CAPITAL LETTER ETA) - 72 0x0398 Θ (GREEK CAPITAL LETTER THETA) - 73 0x0399 Ι (GREEK CAPITAL LETTER IOTA) - 74 0x039A Κ (GREEK CAPITAL LETTER KAPPA) - 75 0x039B Λ (GREEK CAPITAL LETTER LAMDA) - 76 0x039C Μ (GREEK CAPITAL LETTER MU) - 77 0x039D Ν (GREEK CAPITAL LETTER NU) - 78 0x039E Ξ (GREEK CAPITAL LETTER XI) - 79 0x039F Ο (GREEK CAPITAL LETTER OMICRON) - 80 0x03A0 Π (GREEK CAPITAL LETTER PI) - 81 0x03A1 Ρ (GREEK CAPITAL LETTER RHO) - 83 0x03A3 Σ (GREEK CAPITAL LETTER SIGMA) - 84 0x03A4 Τ (GREEK CAPITAL LETTER TAU) - 85 0x03A5 Υ (GREEK CAPITAL LETTER UPSILON) - 86 0x03A6 Φ (GREEK CAPITAL LETTER PHI) - 87 0x03A7 Χ (GREEK CAPITAL LETTER CHI) - 88 0x03A8 Ψ (GREEK CAPITAL LETTER PSI) - 89 0x03A9 Ω (GREEK CAPITAL LETTER OMEGA) - 90 0x03AA Ϊ (GREEK CAPITAL LETTER IOTA WITH DIALYTIKA) - 91 0x03AB Ϋ (GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA) - 92 0x03AC ά (GREEK SMALL LETTER ALPHA WITH TONOS) - 93 0x03AD έ (GREEK SMALL LETTER EPSILON WITH TONOS) - 94 0x03AE ή (GREEK SMALL LETTER ETA WITH TONOS) - 95 0x03AF ί (GREEK SMALL LETTER IOTA WITH TONOS) - 96 0x03B0 ΰ (GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS) - 97 0x03B1 α (GREEK SMALL LETTER ALPHA) - 98 0x03B2 β (GREEK SMALL LETTER BETA) - 99 0x03B3 γ (GREEK SMALL LETTER GAMMA) -100 0x03B4 δ (GREEK SMALL LETTER DELTA) -101 0x03B5 ε (GREEK SMALL LETTER EPSILON) -102 0x03B6 ζ (GREEK SMALL LETTER ZETA) -103 0x03B7 η (GREEK SMALL LETTER ETA) -104 0x03B8 θ (GREEK SMALL LETTER THETA) -105 0x03B9 ι (GREEK SMALL LETTER IOTA) -106 0x03BA κ (GREEK SMALL LETTER KAPPA) -107 0x03BB λ (GREEK SMALL LETTER LAMDA) -108 0x03BC μ (GREEK SMALL LETTER MU) -109 0x03BD ν (GREEK SMALL LETTER NU) -110 0x03BE ξ (GREEK SMALL LETTER XI) -111 0x03BF ο (GREEK SMALL LETTER OMICRON) -112 0x03C0 π (GREEK SMALL LETTER PI) -113 0x03C1 ρ (GREEK SMALL LETTER RHO) -114 0x03C2 ς (GREEK SMALL LETTER FINAL SIGMA) -115 0x03C3 σ (GREEK SMALL LETTER SIGMA) -116 0x03C4 τ (GREEK SMALL LETTER TAU) -117 0x03C5 υ (GREEK SMALL LETTER UPSILON) -118 0x03C6 φ (GREEK SMALL LETTER PHI) -119 0x03C7 χ (GREEK SMALL LETTER CHI) -120 0x03C8 ψ (GREEK SMALL LETTER PSI) -121 0x03C9 ω (GREEK SMALL LETTER OMEGA) -122 0x03CA ϊ (GREEK SMALL LETTER IOTA WITH DIALYTIKA) -123 0x03CB ϋ (GREEK SMALL LETTER UPSILON WITH DIALYTIKA) -124 0x03CC ό (GREEK SMALL LETTER OMICRON WITH TONOS) -125 0x03CD ύ (GREEK SMALL LETTER UPSILON WITH TONOS) -126 0x03CE ώ (GREEK SMALL LETTER OMEGA WITH TONOS) diff --git a/sub_crates/text_encoding/encoding_tables/index-windows-1254.txt b/sub_crates/text_encoding/encoding_tables/index-windows-1254.txt deleted file mode 100644 index e8694a7..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-windows-1254.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-windows-1254.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: e80a27adf377438be8ba5bd223875ea56d6a4d47f958cce1c957a2c446825caa -# Date: 2018-01-06 - - 0 0x20AC € (EURO SIGN) - 1 0x0081  () - 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) - 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) - 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 5 0x2026 … (HORIZONTAL ELLIPSIS) - 6 0x2020 † (DAGGER) - 7 0x2021 ‡ (DOUBLE DAGGER) - 8 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) - 9 0x2030 ‰ (PER MILLE SIGN) - 10 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) - 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) - 12 0x0152 Œ (LATIN CAPITAL LIGATURE OE) - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 21 0x2022 • (BULLET) - 22 0x2013 – (EN DASH) - 23 0x2014 — (EM DASH) - 24 0x02DC ˜ (SMALL TILDE) - 25 0x2122 ™ (TRADE MARK SIGN) - 26 0x0161 š (LATIN SMALL LETTER S WITH CARON) - 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) - 28 0x0153 œ (LATIN SMALL LIGATURE OE) - 29 0x009D  () - 30 0x009E ž () - 31 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) - 32 0x00A0   (NO-BREAK SPACE) - 33 0x00A1 ¡ (INVERTED EXCLAMATION MARK) - 34 0x00A2 ¢ (CENT SIGN) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x00A5 ¥ (YEN SIGN) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x00AA ª (FEMININE ORDINAL INDICATOR) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x00AF ¯ (MACRON) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x00B8 ¸ (CEDILLA) - 57 0x00B9 ¹ (SUPERSCRIPT ONE) - 58 0x00BA º (MASCULINE ORDINAL INDICATOR) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) - 63 0x00BF ¿ (INVERTED QUESTION MARK) - 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) - 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) - 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) - 80 0x011E Ğ (LATIN CAPITAL LETTER G WITH BREVE) - 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) - 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) - 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x0130 İ (LATIN CAPITAL LETTER I WITH DOT ABOVE) - 94 0x015E Ş (LATIN CAPITAL LETTER S WITH CEDILLA) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) -102 0x00E6 æ (LATIN SMALL LETTER AE) -103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) -104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) -112 0x011F ğ (LATIN SMALL LETTER G WITH BREVE) -113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) -114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) -121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x0131 ı (LATIN SMALL LETTER DOTLESS I) -126 0x015F ş (LATIN SMALL LETTER S WITH CEDILLA) -127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/sub_crates/text_encoding/encoding_tables/index-windows-1255.txt b/sub_crates/text_encoding/encoding_tables/index-windows-1255.txt deleted file mode 100644 index 2c9deee..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-windows-1255.txt +++ /dev/null @@ -1,124 +0,0 @@ -# For details on index index-windows-1255.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: cd7fb43c97eefa1651084d92d02af53ad668bd848528c18c3b1af5c06b499651 -# Date: 2018-01-06 - - 0 0x20AC € (EURO SIGN) - 1 0x0081  () - 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) - 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) - 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 5 0x2026 … (HORIZONTAL ELLIPSIS) - 6 0x2020 † (DAGGER) - 7 0x2021 ‡ (DOUBLE DAGGER) - 8 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) - 9 0x2030 ‰ (PER MILLE SIGN) - 10 0x008A Š () - 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 21 0x2022 • (BULLET) - 22 0x2013 – (EN DASH) - 23 0x2014 — (EM DASH) - 24 0x02DC ˜ (SMALL TILDE) - 25 0x2122 ™ (TRADE MARK SIGN) - 26 0x009A š () - 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x00A1 ¡ (INVERTED EXCLAMATION MARK) - 34 0x00A2 ¢ (CENT SIGN) - 35 0x00A3 £ (POUND SIGN) - 36 0x20AA ₪ (NEW SHEQEL SIGN) - 37 0x00A5 ¥ (YEN SIGN) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x00D7 × (MULTIPLICATION SIGN) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x00AF ¯ (MACRON) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x00B8 ¸ (CEDILLA) - 57 0x00B9 ¹ (SUPERSCRIPT ONE) - 58 0x00F7 ÷ (DIVISION SIGN) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) - 63 0x00BF ¿ (INVERTED QUESTION MARK) - 64 0x05B0 ְ (HEBREW POINT SHEVA) - 65 0x05B1 ֱ (HEBREW POINT HATAF SEGOL) - 66 0x05B2 ֲ (HEBREW POINT HATAF PATAH) - 67 0x05B3 ֳ (HEBREW POINT HATAF QAMATS) - 68 0x05B4 ִ (HEBREW POINT HIRIQ) - 69 0x05B5 ֵ (HEBREW POINT TSERE) - 70 0x05B6 ֶ (HEBREW POINT SEGOL) - 71 0x05B7 ַ (HEBREW POINT PATAH) - 72 0x05B8 ָ (HEBREW POINT QAMATS) - 73 0x05B9 ֹ (HEBREW POINT HOLAM) - 74 0x05BA ֺ (HEBREW POINT HOLAM HASER FOR VAV) - 75 0x05BB ֻ (HEBREW POINT QUBUTS) - 76 0x05BC ּ (HEBREW POINT DAGESH OR MAPIQ) - 77 0x05BD ֽ (HEBREW POINT METEG) - 78 0x05BE ־ (HEBREW PUNCTUATION MAQAF) - 79 0x05BF ֿ (HEBREW POINT RAFE) - 80 0x05C0 ׀ (HEBREW PUNCTUATION PASEQ) - 81 0x05C1 ׁ (HEBREW POINT SHIN DOT) - 82 0x05C2 ׂ (HEBREW POINT SIN DOT) - 83 0x05C3 ׃ (HEBREW PUNCTUATION SOF PASUQ) - 84 0x05F0 װ (HEBREW LIGATURE YIDDISH DOUBLE VAV) - 85 0x05F1 ױ (HEBREW LIGATURE YIDDISH VAV YOD) - 86 0x05F2 ײ (HEBREW LIGATURE YIDDISH DOUBLE YOD) - 87 0x05F3 ׳ (HEBREW PUNCTUATION GERESH) - 88 0x05F4 ״ (HEBREW PUNCTUATION GERSHAYIM) - 96 0x05D0 א (HEBREW LETTER ALEF) - 97 0x05D1 ב (HEBREW LETTER BET) - 98 0x05D2 ג (HEBREW LETTER GIMEL) - 99 0x05D3 ד (HEBREW LETTER DALET) -100 0x05D4 ה (HEBREW LETTER HE) -101 0x05D5 ו (HEBREW LETTER VAV) -102 0x05D6 ז (HEBREW LETTER ZAYIN) -103 0x05D7 ח (HEBREW LETTER HET) -104 0x05D8 ט (HEBREW LETTER TET) -105 0x05D9 י (HEBREW LETTER YOD) -106 0x05DA ך (HEBREW LETTER FINAL KAF) -107 0x05DB כ (HEBREW LETTER KAF) -108 0x05DC ל (HEBREW LETTER LAMED) -109 0x05DD ם (HEBREW LETTER FINAL MEM) -110 0x05DE מ (HEBREW LETTER MEM) -111 0x05DF ן (HEBREW LETTER FINAL NUN) -112 0x05E0 נ (HEBREW LETTER NUN) -113 0x05E1 ס (HEBREW LETTER SAMEKH) -114 0x05E2 ע (HEBREW LETTER AYIN) -115 0x05E3 ף (HEBREW LETTER FINAL PE) -116 0x05E4 פ (HEBREW LETTER PE) -117 0x05E5 ץ (HEBREW LETTER FINAL TSADI) -118 0x05E6 צ (HEBREW LETTER TSADI) -119 0x05E7 ק (HEBREW LETTER QOF) -120 0x05E8 ר (HEBREW LETTER RESH) -121 0x05E9 ש (HEBREW LETTER SHIN) -122 0x05EA ת (HEBREW LETTER TAV) -125 0x200E ‎ (LEFT-TO-RIGHT MARK) -126 0x200F ‏ (RIGHT-TO-LEFT MARK) diff --git a/sub_crates/text_encoding/encoding_tables/index-windows-1256.txt b/sub_crates/text_encoding/encoding_tables/index-windows-1256.txt deleted file mode 100644 index 0ab9736..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-windows-1256.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-windows-1256.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 161bdb381f16408e8bebcc8f5310c4190af0e359de8d9bbaa3628ce2f0875509 -# Date: 2018-01-06 - - 0 0x20AC € (EURO SIGN) - 1 0x067E پ (ARABIC LETTER PEH) - 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) - 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) - 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 5 0x2026 … (HORIZONTAL ELLIPSIS) - 6 0x2020 † (DAGGER) - 7 0x2021 ‡ (DOUBLE DAGGER) - 8 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) - 9 0x2030 ‰ (PER MILLE SIGN) - 10 0x0679 ٹ (ARABIC LETTER TTEH) - 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) - 12 0x0152 Œ (LATIN CAPITAL LIGATURE OE) - 13 0x0686 چ (ARABIC LETTER TCHEH) - 14 0x0698 ژ (ARABIC LETTER JEH) - 15 0x0688 ڈ (ARABIC LETTER DDAL) - 16 0x06AF گ (ARABIC LETTER GAF) - 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 21 0x2022 • (BULLET) - 22 0x2013 – (EN DASH) - 23 0x2014 — (EM DASH) - 24 0x06A9 ک (ARABIC LETTER KEHEH) - 25 0x2122 ™ (TRADE MARK SIGN) - 26 0x0691 ڑ (ARABIC LETTER RREH) - 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) - 28 0x0153 œ (LATIN SMALL LIGATURE OE) - 29 0x200C ‌ (ZERO WIDTH NON-JOINER) - 30 0x200D ‍ (ZERO WIDTH JOINER) - 31 0x06BA ں (ARABIC LETTER NOON GHUNNA) - 32 0x00A0   (NO-BREAK SPACE) - 33 0x060C ، (ARABIC COMMA) - 34 0x00A2 ¢ (CENT SIGN) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x00A5 ¥ (YEN SIGN) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x06BE ھ (ARABIC LETTER HEH DOACHASHMEE) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x00AF ¯ (MACRON) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x00B8 ¸ (CEDILLA) - 57 0x00B9 ¹ (SUPERSCRIPT ONE) - 58 0x061B ؛ (ARABIC SEMICOLON) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) - 63 0x061F ؟ (ARABIC QUESTION MARK) - 64 0x06C1 ہ (ARABIC LETTER HEH GOAL) - 65 0x0621 ء (ARABIC LETTER HAMZA) - 66 0x0622 آ (ARABIC LETTER ALEF WITH MADDA ABOVE) - 67 0x0623 أ (ARABIC LETTER ALEF WITH HAMZA ABOVE) - 68 0x0624 ؤ (ARABIC LETTER WAW WITH HAMZA ABOVE) - 69 0x0625 إ (ARABIC LETTER ALEF WITH HAMZA BELOW) - 70 0x0626 ئ (ARABIC LETTER YEH WITH HAMZA ABOVE) - 71 0x0627 ا (ARABIC LETTER ALEF) - 72 0x0628 ب (ARABIC LETTER BEH) - 73 0x0629 ة (ARABIC LETTER TEH MARBUTA) - 74 0x062A ت (ARABIC LETTER TEH) - 75 0x062B ث (ARABIC LETTER THEH) - 76 0x062C ج (ARABIC LETTER JEEM) - 77 0x062D ح (ARABIC LETTER HAH) - 78 0x062E خ (ARABIC LETTER KHAH) - 79 0x062F د (ARABIC LETTER DAL) - 80 0x0630 ذ (ARABIC LETTER THAL) - 81 0x0631 ر (ARABIC LETTER REH) - 82 0x0632 ز (ARABIC LETTER ZAIN) - 83 0x0633 س (ARABIC LETTER SEEN) - 84 0x0634 ش (ARABIC LETTER SHEEN) - 85 0x0635 ص (ARABIC LETTER SAD) - 86 0x0636 ض (ARABIC LETTER DAD) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x0637 ط (ARABIC LETTER TAH) - 89 0x0638 ظ (ARABIC LETTER ZAH) - 90 0x0639 ع (ARABIC LETTER AIN) - 91 0x063A غ (ARABIC LETTER GHAIN) - 92 0x0640 ـ (ARABIC TATWEEL) - 93 0x0641 ف (ARABIC LETTER FEH) - 94 0x0642 ق (ARABIC LETTER QAF) - 95 0x0643 ك (ARABIC LETTER KAF) - 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) - 97 0x0644 ل (ARABIC LETTER LAM) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x0645 م (ARABIC LETTER MEEM) -100 0x0646 ن (ARABIC LETTER NOON) -101 0x0647 ه (ARABIC LETTER HEH) -102 0x0648 و (ARABIC LETTER WAW) -103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) -104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x0649 ى (ARABIC LETTER ALEF MAKSURA) -109 0x064A ي (ARABIC LETTER YEH) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) -112 0x064B ً (ARABIC FATHATAN) -113 0x064C ٌ (ARABIC DAMMATAN) -114 0x064D ٍ (ARABIC KASRATAN) -115 0x064E َ (ARABIC FATHA) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x064F ُ (ARABIC DAMMA) -118 0x0650 ِ (ARABIC KASRA) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x0651 ّ (ARABIC SHADDA) -121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) -122 0x0652 ْ (ARABIC SUKUN) -123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x200E ‎ (LEFT-TO-RIGHT MARK) -126 0x200F ‏ (RIGHT-TO-LEFT MARK) -127 0x06D2 ے (ARABIC LETTER YEH BARREE) diff --git a/sub_crates/text_encoding/encoding_tables/index-windows-1257.txt b/sub_crates/text_encoding/encoding_tables/index-windows-1257.txt deleted file mode 100644 index da72914..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-windows-1257.txt +++ /dev/null @@ -1,132 +0,0 @@ -# For details on index index-windows-1257.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: cc7256bdd10a5b8dc7fb6f994659f307dfcae60def9aa6c29d811f85e2842c47 -# Date: 2018-01-06 - - 0 0x20AC € (EURO SIGN) - 1 0x0081  () - 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) - 3 0x0083 ƒ () - 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 5 0x2026 … (HORIZONTAL ELLIPSIS) - 6 0x2020 † (DAGGER) - 7 0x2021 ‡ (DOUBLE DAGGER) - 8 0x0088 ˆ () - 9 0x2030 ‰ (PER MILLE SIGN) - 10 0x008A Š () - 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) - 12 0x008C Œ () - 13 0x00A8 ¨ (DIAERESIS) - 14 0x02C7 ˇ (CARON) - 15 0x00B8 ¸ (CEDILLA) - 16 0x0090  () - 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 21 0x2022 • (BULLET) - 22 0x2013 – (EN DASH) - 23 0x2014 — (EM DASH) - 24 0x0098 ˜ () - 25 0x2122 ™ (TRADE MARK SIGN) - 26 0x009A š () - 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) - 28 0x009C œ () - 29 0x00AF ¯ (MACRON) - 30 0x02DB ˛ (OGONEK) - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 34 0x00A2 ¢ (CENT SIGN) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A4 ¤ (CURRENCY SIGN) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x0156 Ŗ (LATIN CAPITAL LETTER R WITH CEDILLA) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) - 57 0x00B9 ¹ (SUPERSCRIPT ONE) - 58 0x0157 ŗ (LATIN SMALL LETTER R WITH CEDILLA) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) - 63 0x00E6 æ (LATIN SMALL LETTER AE) - 64 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) - 65 0x012E Į (LATIN CAPITAL LETTER I WITH OGONEK) - 66 0x0100 Ā (LATIN CAPITAL LETTER A WITH MACRON) - 67 0x0106 Ć (LATIN CAPITAL LETTER C WITH ACUTE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) - 70 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) - 71 0x0112 Ē (LATIN CAPITAL LETTER E WITH MACRON) - 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x0179 Ź (LATIN CAPITAL LETTER Z WITH ACUTE) - 75 0x0116 Ė (LATIN CAPITAL LETTER E WITH DOT ABOVE) - 76 0x0122 Ģ (LATIN CAPITAL LETTER G WITH CEDILLA) - 77 0x0136 Ķ (LATIN CAPITAL LETTER K WITH CEDILLA) - 78 0x012A Ī (LATIN CAPITAL LETTER I WITH MACRON) - 79 0x013B Ļ (LATIN CAPITAL LETTER L WITH CEDILLA) - 80 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) - 81 0x0143 Ń (LATIN CAPITAL LETTER N WITH ACUTE) - 82 0x0145 Ņ (LATIN CAPITAL LETTER N WITH CEDILLA) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x014C Ō (LATIN CAPITAL LETTER O WITH MACRON) - 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x0172 Ų (LATIN CAPITAL LETTER U WITH OGONEK) - 89 0x0141 Ł (LATIN CAPITAL LETTER L WITH STROKE) - 90 0x015A Ś (LATIN CAPITAL LETTER S WITH ACUTE) - 91 0x016A Ū (LATIN CAPITAL LETTER U WITH MACRON) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) - 94 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) - 97 0x012F į (LATIN SMALL LETTER I WITH OGONEK) - 98 0x0101 ā (LATIN SMALL LETTER A WITH MACRON) - 99 0x0107 ć (LATIN SMALL LETTER C WITH ACUTE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) -102 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) -103 0x0113 ē (LATIN SMALL LETTER E WITH MACRON) -104 0x010D č (LATIN SMALL LETTER C WITH CARON) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x017A ź (LATIN SMALL LETTER Z WITH ACUTE) -107 0x0117 ė (LATIN SMALL LETTER E WITH DOT ABOVE) -108 0x0123 ģ (LATIN SMALL LETTER G WITH CEDILLA) -109 0x0137 ķ (LATIN SMALL LETTER K WITH CEDILLA) -110 0x012B ī (LATIN SMALL LETTER I WITH MACRON) -111 0x013C ļ (LATIN SMALL LETTER L WITH CEDILLA) -112 0x0161 š (LATIN SMALL LETTER S WITH CARON) -113 0x0144 ń (LATIN SMALL LETTER N WITH ACUTE) -114 0x0146 ņ (LATIN SMALL LETTER N WITH CEDILLA) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x014D ō (LATIN SMALL LETTER O WITH MACRON) -117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x0173 ų (LATIN SMALL LETTER U WITH OGONEK) -121 0x0142 ł (LATIN SMALL LETTER L WITH STROKE) -122 0x015B ś (LATIN SMALL LETTER S WITH ACUTE) -123 0x016B ū (LATIN SMALL LETTER U WITH MACRON) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) -126 0x017E ž (LATIN SMALL LETTER Z WITH CARON) -127 0x02D9 ˙ (DOT ABOVE) diff --git a/sub_crates/text_encoding/encoding_tables/index-windows-1258.txt b/sub_crates/text_encoding/encoding_tables/index-windows-1258.txt deleted file mode 100644 index 141a066..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-windows-1258.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-windows-1258.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 198bacedfcf24390e219240a7b776b6cec34cff070330b08a601a69c67f7eb24 -# Date: 2018-01-06 - - 0 0x20AC € (EURO SIGN) - 1 0x0081  () - 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) - 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) - 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 5 0x2026 … (HORIZONTAL ELLIPSIS) - 6 0x2020 † (DAGGER) - 7 0x2021 ‡ (DOUBLE DAGGER) - 8 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) - 9 0x2030 ‰ (PER MILLE SIGN) - 10 0x008A Š () - 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) - 12 0x0152 Œ (LATIN CAPITAL LIGATURE OE) - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 21 0x2022 • (BULLET) - 22 0x2013 – (EN DASH) - 23 0x2014 — (EM DASH) - 24 0x02DC ˜ (SMALL TILDE) - 25 0x2122 ™ (TRADE MARK SIGN) - 26 0x009A š () - 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) - 28 0x0153 œ (LATIN SMALL LIGATURE OE) - 29 0x009D  () - 30 0x009E ž () - 31 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) - 32 0x00A0   (NO-BREAK SPACE) - 33 0x00A1 ¡ (INVERTED EXCLAMATION MARK) - 34 0x00A2 ¢ (CENT SIGN) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A4 ¤ (CURRENCY SIGN) - 37 0x00A5 ¥ (YEN SIGN) - 38 0x00A6 ¦ (BROKEN BAR) - 39 0x00A7 § (SECTION SIGN) - 40 0x00A8 ¨ (DIAERESIS) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x00AA ª (FEMININE ORDINAL INDICATOR) - 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 44 0x00AC ¬ (NOT SIGN) - 45 0x00AD ­ (SOFT HYPHEN) - 46 0x00AE ® (REGISTERED SIGN) - 47 0x00AF ¯ (MACRON) - 48 0x00B0 ° (DEGREE SIGN) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x00B2 ² (SUPERSCRIPT TWO) - 51 0x00B3 ³ (SUPERSCRIPT THREE) - 52 0x00B4 ´ (ACUTE ACCENT) - 53 0x00B5 µ (MICRO SIGN) - 54 0x00B6 ¶ (PILCROW SIGN) - 55 0x00B7 · (MIDDLE DOT) - 56 0x00B8 ¸ (CEDILLA) - 57 0x00B9 ¹ (SUPERSCRIPT ONE) - 58 0x00BA º (MASCULINE ORDINAL INDICATOR) - 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) - 61 0x00BD ½ (VULGAR FRACTION ONE HALF) - 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) - 63 0x00BF ¿ (INVERTED QUESTION MARK) - 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) - 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) - 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) - 67 0x0102 Ă (LATIN CAPITAL LETTER A WITH BREVE) - 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) - 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) - 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) - 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) - 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) - 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) - 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) - 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) - 76 0x0300 ̀ (COMBINING GRAVE ACCENT) - 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) - 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) - 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) - 80 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) - 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) - 82 0x0309 ̉ (COMBINING HOOK ABOVE) - 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) - 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) - 85 0x01A0 Ơ (LATIN CAPITAL LETTER O WITH HORN) - 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) - 87 0x00D7 × (MULTIPLICATION SIGN) - 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) - 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) - 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) - 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) - 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) - 93 0x01AF Ư (LATIN CAPITAL LETTER U WITH HORN) - 94 0x0303 ̃ (COMBINING TILDE) - 95 0x00DF ß (LATIN SMALL LETTER SHARP S) - 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) - 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) - 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) - 99 0x0103 ă (LATIN SMALL LETTER A WITH BREVE) -100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) -101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) -102 0x00E6 æ (LATIN SMALL LETTER AE) -103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) -104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) -105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) -106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) -107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) -108 0x0301 ́ (COMBINING ACUTE ACCENT) -109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) -110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) -111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) -112 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) -113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) -114 0x0323 ̣ (COMBINING DOT BELOW) -115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) -116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) -117 0x01A1 ơ (LATIN SMALL LETTER O WITH HORN) -118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) -119 0x00F7 ÷ (DIVISION SIGN) -120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) -121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) -122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) -123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) -124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) -125 0x01B0 ư (LATIN SMALL LETTER U WITH HORN) -126 0x20AB ₫ (DONG SIGN) -127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/sub_crates/text_encoding/encoding_tables/index-windows-874.txt b/sub_crates/text_encoding/encoding_tables/index-windows-874.txt deleted file mode 100644 index 21db6df..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-windows-874.txt +++ /dev/null @@ -1,126 +0,0 @@ -# For details on index index-windows-874.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: b416583ce125e38474381b31b401a98b19ecf2e57e0998e78a1e18b14894905d -# Date: 2018-01-06 - - 0 0x20AC € (EURO SIGN) - 1 0x0081  () - 2 0x0082 ‚ () - 3 0x0083 ƒ () - 4 0x0084 „ () - 5 0x2026 … (HORIZONTAL ELLIPSIS) - 6 0x0086 † () - 7 0x0087 ‡ () - 8 0x0088 ˆ () - 9 0x0089 ‰ () - 10 0x008A Š () - 11 0x008B ‹ () - 12 0x008C Œ () - 13 0x008D  () - 14 0x008E Ž () - 15 0x008F  () - 16 0x0090  () - 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 21 0x2022 • (BULLET) - 22 0x2013 – (EN DASH) - 23 0x2014 — (EM DASH) - 24 0x0098 ˜ () - 25 0x0099 ™ () - 26 0x009A š () - 27 0x009B › () - 28 0x009C œ () - 29 0x009D  () - 30 0x009E ž () - 31 0x009F Ÿ () - 32 0x00A0   (NO-BREAK SPACE) - 33 0x0E01 ก (THAI CHARACTER KO KAI) - 34 0x0E02 ข (THAI CHARACTER KHO KHAI) - 35 0x0E03 ฃ (THAI CHARACTER KHO KHUAT) - 36 0x0E04 ค (THAI CHARACTER KHO KHWAI) - 37 0x0E05 ฅ (THAI CHARACTER KHO KHON) - 38 0x0E06 ฆ (THAI CHARACTER KHO RAKHANG) - 39 0x0E07 ง (THAI CHARACTER NGO NGU) - 40 0x0E08 จ (THAI CHARACTER CHO CHAN) - 41 0x0E09 ฉ (THAI CHARACTER CHO CHING) - 42 0x0E0A ช (THAI CHARACTER CHO CHANG) - 43 0x0E0B ซ (THAI CHARACTER SO SO) - 44 0x0E0C ฌ (THAI CHARACTER CHO CHOE) - 45 0x0E0D ญ (THAI CHARACTER YO YING) - 46 0x0E0E ฎ (THAI CHARACTER DO CHADA) - 47 0x0E0F ฏ (THAI CHARACTER TO PATAK) - 48 0x0E10 ฐ (THAI CHARACTER THO THAN) - 49 0x0E11 ฑ (THAI CHARACTER THO NANGMONTHO) - 50 0x0E12 ฒ (THAI CHARACTER THO PHUTHAO) - 51 0x0E13 ณ (THAI CHARACTER NO NEN) - 52 0x0E14 ด (THAI CHARACTER DO DEK) - 53 0x0E15 ต (THAI CHARACTER TO TAO) - 54 0x0E16 ถ (THAI CHARACTER THO THUNG) - 55 0x0E17 ท (THAI CHARACTER THO THAHAN) - 56 0x0E18 ธ (THAI CHARACTER THO THONG) - 57 0x0E19 น (THAI CHARACTER NO NU) - 58 0x0E1A บ (THAI CHARACTER BO BAIMAI) - 59 0x0E1B ป (THAI CHARACTER PO PLA) - 60 0x0E1C ผ (THAI CHARACTER PHO PHUNG) - 61 0x0E1D ฝ (THAI CHARACTER FO FA) - 62 0x0E1E พ (THAI CHARACTER PHO PHAN) - 63 0x0E1F ฟ (THAI CHARACTER FO FAN) - 64 0x0E20 ภ (THAI CHARACTER PHO SAMPHAO) - 65 0x0E21 ม (THAI CHARACTER MO MA) - 66 0x0E22 ย (THAI CHARACTER YO YAK) - 67 0x0E23 ร (THAI CHARACTER RO RUA) - 68 0x0E24 ฤ (THAI CHARACTER RU) - 69 0x0E25 ล (THAI CHARACTER LO LING) - 70 0x0E26 ฦ (THAI CHARACTER LU) - 71 0x0E27 ว (THAI CHARACTER WO WAEN) - 72 0x0E28 ศ (THAI CHARACTER SO SALA) - 73 0x0E29 ษ (THAI CHARACTER SO RUSI) - 74 0x0E2A ส (THAI CHARACTER SO SUA) - 75 0x0E2B ห (THAI CHARACTER HO HIP) - 76 0x0E2C ฬ (THAI CHARACTER LO CHULA) - 77 0x0E2D อ (THAI CHARACTER O ANG) - 78 0x0E2E ฮ (THAI CHARACTER HO NOKHUK) - 79 0x0E2F ฯ (THAI CHARACTER PAIYANNOI) - 80 0x0E30 ะ (THAI CHARACTER SARA A) - 81 0x0E31 ั (THAI CHARACTER MAI HAN-AKAT) - 82 0x0E32 า (THAI CHARACTER SARA AA) - 83 0x0E33 ำ (THAI CHARACTER SARA AM) - 84 0x0E34 ิ (THAI CHARACTER SARA I) - 85 0x0E35 ี (THAI CHARACTER SARA II) - 86 0x0E36 ึ (THAI CHARACTER SARA UE) - 87 0x0E37 ื (THAI CHARACTER SARA UEE) - 88 0x0E38 ุ (THAI CHARACTER SARA U) - 89 0x0E39 ู (THAI CHARACTER SARA UU) - 90 0x0E3A ฺ (THAI CHARACTER PHINTHU) - 95 0x0E3F ฿ (THAI CURRENCY SYMBOL BAHT) - 96 0x0E40 เ (THAI CHARACTER SARA E) - 97 0x0E41 แ (THAI CHARACTER SARA AE) - 98 0x0E42 โ (THAI CHARACTER SARA O) - 99 0x0E43 ใ (THAI CHARACTER SARA AI MAIMUAN) -100 0x0E44 ไ (THAI CHARACTER SARA AI MAIMALAI) -101 0x0E45 ๅ (THAI CHARACTER LAKKHANGYAO) -102 0x0E46 ๆ (THAI CHARACTER MAIYAMOK) -103 0x0E47 ็ (THAI CHARACTER MAITAIKHU) -104 0x0E48 ่ (THAI CHARACTER MAI EK) -105 0x0E49 ้ (THAI CHARACTER MAI THO) -106 0x0E4A ๊ (THAI CHARACTER MAI TRI) -107 0x0E4B ๋ (THAI CHARACTER MAI CHATTAWA) -108 0x0E4C ์ (THAI CHARACTER THANTHAKHAT) -109 0x0E4D ํ (THAI CHARACTER NIKHAHIT) -110 0x0E4E ๎ (THAI CHARACTER YAMAKKAN) -111 0x0E4F ๏ (THAI CHARACTER FONGMAN) -112 0x0E50 ๐ (THAI DIGIT ZERO) -113 0x0E51 ๑ (THAI DIGIT ONE) -114 0x0E52 ๒ (THAI DIGIT TWO) -115 0x0E53 ๓ (THAI DIGIT THREE) -116 0x0E54 ๔ (THAI DIGIT FOUR) -117 0x0E55 ๕ (THAI DIGIT FIVE) -118 0x0E56 ๖ (THAI DIGIT SIX) -119 0x0E57 ๗ (THAI DIGIT SEVEN) -120 0x0E58 ๘ (THAI DIGIT EIGHT) -121 0x0E59 ๙ (THAI DIGIT NINE) -122 0x0E5A ๚ (THAI CHARACTER ANGKHANKHU) -123 0x0E5B ๛ (THAI CHARACTER KHOMUT) diff --git a/sub_crates/text_encoding/encoding_tables/index-x-mac-cyrillic.txt b/sub_crates/text_encoding/encoding_tables/index-x-mac-cyrillic.txt deleted file mode 100644 index de05e25..0000000 --- a/sub_crates/text_encoding/encoding_tables/index-x-mac-cyrillic.txt +++ /dev/null @@ -1,134 +0,0 @@ -# For details on index index-x-mac-cyrillic.txt see the Encoding Standard -# https://encoding.spec.whatwg.org/ -# -# Identifier: 73e8e7642c6fa9de29d42819b47fba55b58666fb1e339faeb4a89a0bd7c24d43 -# Date: 2018-01-06 - - 0 0x0410 А (CYRILLIC CAPITAL LETTER A) - 1 0x0411 Б (CYRILLIC CAPITAL LETTER BE) - 2 0x0412 В (CYRILLIC CAPITAL LETTER VE) - 3 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) - 4 0x0414 Д (CYRILLIC CAPITAL LETTER DE) - 5 0x0415 Е (CYRILLIC CAPITAL LETTER IE) - 6 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) - 7 0x0417 З (CYRILLIC CAPITAL LETTER ZE) - 8 0x0418 И (CYRILLIC CAPITAL LETTER I) - 9 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) - 10 0x041A К (CYRILLIC CAPITAL LETTER KA) - 11 0x041B Л (CYRILLIC CAPITAL LETTER EL) - 12 0x041C М (CYRILLIC CAPITAL LETTER EM) - 13 0x041D Н (CYRILLIC CAPITAL LETTER EN) - 14 0x041E О (CYRILLIC CAPITAL LETTER O) - 15 0x041F П (CYRILLIC CAPITAL LETTER PE) - 16 0x0420 Р (CYRILLIC CAPITAL LETTER ER) - 17 0x0421 С (CYRILLIC CAPITAL LETTER ES) - 18 0x0422 Т (CYRILLIC CAPITAL LETTER TE) - 19 0x0423 У (CYRILLIC CAPITAL LETTER U) - 20 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) - 21 0x0425 Х (CYRILLIC CAPITAL LETTER HA) - 22 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) - 23 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) - 24 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) - 25 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) - 26 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) - 27 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) - 28 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) - 29 0x042D Э (CYRILLIC CAPITAL LETTER E) - 30 0x042E Ю (CYRILLIC CAPITAL LETTER YU) - 31 0x042F Я (CYRILLIC CAPITAL LETTER YA) - 32 0x2020 † (DAGGER) - 33 0x00B0 ° (DEGREE SIGN) - 34 0x0490 Ґ (CYRILLIC CAPITAL LETTER GHE WITH UPTURN) - 35 0x00A3 £ (POUND SIGN) - 36 0x00A7 § (SECTION SIGN) - 37 0x2022 • (BULLET) - 38 0x00B6 ¶ (PILCROW SIGN) - 39 0x0406 І (CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I) - 40 0x00AE ® (REGISTERED SIGN) - 41 0x00A9 © (COPYRIGHT SIGN) - 42 0x2122 ™ (TRADE MARK SIGN) - 43 0x0402 Ђ (CYRILLIC CAPITAL LETTER DJE) - 44 0x0452 ђ (CYRILLIC SMALL LETTER DJE) - 45 0x2260 ≠ (NOT EQUAL TO) - 46 0x0403 Ѓ (CYRILLIC CAPITAL LETTER GJE) - 47 0x0453 ѓ (CYRILLIC SMALL LETTER GJE) - 48 0x221E ∞ (INFINITY) - 49 0x00B1 ± (PLUS-MINUS SIGN) - 50 0x2264 ≤ (LESS-THAN OR EQUAL TO) - 51 0x2265 ≥ (GREATER-THAN OR EQUAL TO) - 52 0x0456 і (CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I) - 53 0x00B5 µ (MICRO SIGN) - 54 0x0491 ґ (CYRILLIC SMALL LETTER GHE WITH UPTURN) - 55 0x0408 Ј (CYRILLIC CAPITAL LETTER JE) - 56 0x0404 Є (CYRILLIC CAPITAL LETTER UKRAINIAN IE) - 57 0x0454 є (CYRILLIC SMALL LETTER UKRAINIAN IE) - 58 0x0407 Ї (CYRILLIC CAPITAL LETTER YI) - 59 0x0457 ї (CYRILLIC SMALL LETTER YI) - 60 0x0409 Љ (CYRILLIC CAPITAL LETTER LJE) - 61 0x0459 љ (CYRILLIC SMALL LETTER LJE) - 62 0x040A Њ (CYRILLIC CAPITAL LETTER NJE) - 63 0x045A њ (CYRILLIC SMALL LETTER NJE) - 64 0x0458 ј (CYRILLIC SMALL LETTER JE) - 65 0x0405 Ѕ (CYRILLIC CAPITAL LETTER DZE) - 66 0x00AC ¬ (NOT SIGN) - 67 0x221A √ (SQUARE ROOT) - 68 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) - 69 0x2248 ≈ (ALMOST EQUAL TO) - 70 0x2206 ∆ (INCREMENT) - 71 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) - 72 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) - 73 0x2026 … (HORIZONTAL ELLIPSIS) - 74 0x00A0   (NO-BREAK SPACE) - 75 0x040B Ћ (CYRILLIC CAPITAL LETTER TSHE) - 76 0x045B ћ (CYRILLIC SMALL LETTER TSHE) - 77 0x040C Ќ (CYRILLIC CAPITAL LETTER KJE) - 78 0x045C ќ (CYRILLIC SMALL LETTER KJE) - 79 0x0455 ѕ (CYRILLIC SMALL LETTER DZE) - 80 0x2013 – (EN DASH) - 81 0x2014 — (EM DASH) - 82 0x201C “ (LEFT DOUBLE QUOTATION MARK) - 83 0x201D ” (RIGHT DOUBLE QUOTATION MARK) - 84 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) - 85 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) - 86 0x00F7 ÷ (DIVISION SIGN) - 87 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) - 88 0x040E Ў (CYRILLIC CAPITAL LETTER SHORT U) - 89 0x045E ў (CYRILLIC SMALL LETTER SHORT U) - 90 0x040F Џ (CYRILLIC CAPITAL LETTER DZHE) - 91 0x045F џ (CYRILLIC SMALL LETTER DZHE) - 92 0x2116 № (NUMERO SIGN) - 93 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) - 94 0x0451 ё (CYRILLIC SMALL LETTER IO) - 95 0x044F я (CYRILLIC SMALL LETTER YA) - 96 0x0430 а (CYRILLIC SMALL LETTER A) - 97 0x0431 б (CYRILLIC SMALL LETTER BE) - 98 0x0432 в (CYRILLIC SMALL LETTER VE) - 99 0x0433 г (CYRILLIC SMALL LETTER GHE) -100 0x0434 д (CYRILLIC SMALL LETTER DE) -101 0x0435 е (CYRILLIC SMALL LETTER IE) -102 0x0436 ж (CYRILLIC SMALL LETTER ZHE) -103 0x0437 з (CYRILLIC SMALL LETTER ZE) -104 0x0438 и (CYRILLIC SMALL LETTER I) -105 0x0439 й (CYRILLIC SMALL LETTER SHORT I) -106 0x043A к (CYRILLIC SMALL LETTER KA) -107 0x043B л (CYRILLIC SMALL LETTER EL) -108 0x043C м (CYRILLIC SMALL LETTER EM) -109 0x043D н (CYRILLIC SMALL LETTER EN) -110 0x043E о (CYRILLIC SMALL LETTER O) -111 0x043F п (CYRILLIC SMALL LETTER PE) -112 0x0440 р (CYRILLIC SMALL LETTER ER) -113 0x0441 с (CYRILLIC SMALL LETTER ES) -114 0x0442 т (CYRILLIC SMALL LETTER TE) -115 0x0443 у (CYRILLIC SMALL LETTER U) -116 0x0444 ф (CYRILLIC SMALL LETTER EF) -117 0x0445 х (CYRILLIC SMALL LETTER HA) -118 0x0446 ц (CYRILLIC SMALL LETTER TSE) -119 0x0447 ч (CYRILLIC SMALL LETTER CHE) -120 0x0448 ш (CYRILLIC SMALL LETTER SHA) -121 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) -122 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) -123 0x044B ы (CYRILLIC SMALL LETTER YERU) -124 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) -125 0x044D э (CYRILLIC SMALL LETTER E) -126 0x044E ю (CYRILLIC SMALL LETTER YU) -127 0x20AC € (EURO SIGN) diff --git a/sub_crates/text_encoding/src/latin1.rs b/sub_crates/text_encoding/src/latin1.rs deleted file mode 100644 index ea36d37..0000000 --- a/sub_crates/text_encoding/src/latin1.rs +++ /dev/null @@ -1,212 +0,0 @@ -//! Encoding/decoding functions for ISO/IEC 8859-1 (or "latin1"), which -//! conveniently happens to map 1-to-1 to the first 256 unicode scalar values. -//! -//! Because latin1 is a single-byte encoding where all bytes are valid, -//! decoding cannot fail. However, encoding will fail with scalar values -//! greater than 255. - -use core; -use {DecodeResult, EncodeError, EncodeResult}; - -pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { - // Do the encode. - let mut input_i = 0; - let mut output_i = 0; - for (offset, c) in input.char_indices() { - if output_i >= output.len() { - break; - } - if c as u32 > 255 { - return Err(EncodeError { - character: c, - error_range: (offset, offset + c.len_utf8()), - output_bytes_written: output_i, - }); - } - output[output_i] = c as u8; - output_i += 1; - input_i = offset + 1; - } - - // Calculate how much of the input was consumed. - if input_i > input.len() { - input_i = input.len(); - } else { - while !input.is_char_boundary(input_i) { - input_i += 1; - } - } - - Ok((input_i, &output[..output_i])) -} - -pub fn decode_to_str<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { - let mut input_i = 0; - let mut output_i = 0; - for &byte in input.iter() { - if byte <= 127 { - // 1-byte case - if output_i >= output.len() { - break; - } - output[output_i] = byte; - input_i += 1; - output_i += 1; - } else { - // 2-byte case - if (output_i + 1) >= output.len() { - break; - } - output[output_i] = 0b11000000 | (byte >> 6); - output[output_i + 1] = 0b10000000 | (byte & 0b00111111); - input_i += 1; - output_i += 2; - } - } - - Ok((input_i, unsafe { - core::str::from_utf8_unchecked(&output[..output_i]) - })) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn encode_01() { - let text = "Hello world!"; - let mut buf = [0u8; 0]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_02() { - let text = "Hello world!"; - let mut buf = [0u8; 1]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 1); - assert_eq!(encoded, "H".as_bytes()); - } - - #[test] - fn encode_03() { - let text = "Hello world!"; - let mut buf = [0u8; 2]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 2); - assert_eq!(encoded, "He".as_bytes()); - } - - #[test] - fn encode_04() { - let text = "Hello world!"; - let mut buf = [0u8; 64]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 12); - assert_eq!(encoded, "Hello world!".as_bytes()); - } - - #[test] - fn encode_05() { - let text = "Hello world!こ"; - let mut buf = [0u8; 12]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 12); - assert_eq!(encoded, "Hello world!".as_bytes()); - } - - #[test] - fn decode_01() { - let data = "Hello world!".as_bytes(); - let mut buf = [0u8; 0]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_02() { - let data = "Hello world!".as_bytes(); - let mut buf = [0u8; 1]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 1); - assert_eq!(decoded, "H"); - } - - #[test] - fn decode_03() { - let data = "Hello world!".as_bytes(); - let mut buf = [0u8; 2]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 2); - assert_eq!(decoded, "He"); - } - - #[test] - fn decode_04() { - let data = "Hello world!".as_bytes(); - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 12); - assert_eq!(decoded, "Hello world!"); - } - - #[test] - fn encode_error_01() { - let text = "こello world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: 'こ', - error_range: (0, 3), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn encode_error_02() { - let text = "Hこllo world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: 'こ', - error_range: (1, 4), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn encode_error_03() { - let text = "Heこlo world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: 'こ', - error_range: (2, 5), - output_bytes_written: 2, - }) - ); - } - - #[test] - fn encode_error_04() { - let text = "Heこlo world!"; - let mut buf = [0u8; 3]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: 'こ', - error_range: (2, 5), - output_bytes_written: 2, - }) - ); - } -} diff --git a/sub_crates/text_encoding/src/lib.rs b/sub_crates/text_encoding/src/lib.rs deleted file mode 100644 index 9114eb1..0000000 --- a/sub_crates/text_encoding/src/lib.rs +++ /dev/null @@ -1,122 +0,0 @@ -#![no_std] - -//! A library for incrementally encoding/decoding between utf8 and various -//! text encodings. - -mod latin1; -mod single_byte; -mod utf16_be; -mod utf16_le; -mod utf32_be; -mod utf32_le; -mod utf8; -mod utils; - -use single_byte::{ibm866, iso_8859_2, iso_8859_7, windows1252}; - -/// Encodes text from utf8 to a destination encoding. -pub fn encode_from_str<'a>( - output_encoding: Encoding, - input: &str, - output: &'a mut [u8], -) -> EncodeResult<'a> { - match output_encoding { - Encoding::Utf8 => utf8::encode_from_str(input, output), - Encoding::Utf16BE => utf16_be::encode_from_str(input, output), - Encoding::Utf16LE => utf16_le::encode_from_str(input, output), - Encoding::Utf32BE => utf32_be::encode_from_str(input, output), - Encoding::Utf32LE => utf32_le::encode_from_str(input, output), - Encoding::IBM866 => ibm866::encode_from_str(input, output), - Encoding::Latin1 => latin1::encode_from_str(input, output), - Encoding::ISO8859_2 => iso_8859_2::encode_from_str(input, output), - Encoding::ISO8859_7 => iso_8859_7::encode_from_str(input, output), - Encoding::Windows1252 => windows1252::encode_from_str(input, output), - } -} - -/// Decodes text from a source encoding to utf8. -pub fn decode_to_str<'a>( - input_encoding: Encoding, - input: &[u8], - output: &'a mut [u8], -) -> DecodeResult<'a> { - match input_encoding { - Encoding::Utf8 => utf8::decode_to_str(input, output), - Encoding::Utf16BE => utf16_be::decode_to_str(input, output), - Encoding::Utf16LE => utf16_le::decode_to_str(input, output), - Encoding::Utf32BE => utf32_be::decode_to_str(input, output), - Encoding::Utf32LE => utf32_le::decode_to_str(input, output), - Encoding::IBM866 => ibm866::decode_to_str(input, output), - Encoding::Latin1 => latin1::decode_to_str(input, output), - Encoding::ISO8859_2 => iso_8859_2::decode_to_str(input, output), - Encoding::ISO8859_7 => iso_8859_7::decode_to_str(input, output), - Encoding::Windows1252 => windows1252::decode_to_str(input, output), - } -} - -/// Describes a text encoding. -#[derive(Debug, Copy, Clone)] -pub enum Encoding { - Utf8, - Utf16BE, // Big endian - Utf16LE, // Little endian - Utf32BE, // Big endian - Utf32LE, // Little endian - // ShiftJIS, - // EUC_JP, - // Big5, - IBM866, // IBM 866 - Latin1, // ISO/IEC 8859-1 - ISO8859_2, // ISO/IEC 8859-2 - ISO8859_7, // ISO/IEC 8859-7 - Windows1252, // Windows code page 1252 -} - -/// Result type for encoding text from utf8 to a target encoding. -/// -/// The Ok() variant provides the number of bytes consumed and a reference -/// to the valid encoded text data. -pub type EncodeResult<'a> = Result<(usize, &'a [u8]), EncodeError>; - -/// Result type for decoding text from a target encoding to utf8. -/// -/// The Ok() variant provides the number of bytes consumed and a reference -/// to the valid decoded text. -pub type DecodeResult<'a> = Result<(usize, &'a str), DecodeError>; - -/// Represents an error when encoding from utf8 to some other format. -/// -/// Since valid input utf8 is statically assumed, the only possible -/// error is encountering a char that is not representable in the target -/// encoding. -/// -/// The problematic character, the byte index range of that character in the -/// input utf8, and the number of bytes already written to the output buffer -/// are provided. -/// -/// It is guaranteed that all input leading up to the problem character has -/// already been encoded and written to the output buffer. -#[derive(Debug, Copy, Clone, PartialEq)] -pub struct EncodeError { - pub character: char, - pub error_range: (usize, usize), - pub output_bytes_written: usize, -} - -/// Represents an error when decoding to utf8 from some other format. -/// -/// All supported text encodings can be fully represented in utf8, and -/// therefore the only possible error is that we encounter bytes in the -/// input data that are invalid for the text encoding we're attempting -/// to decode from. -/// -/// The byte index range of the invalid input data and the number of bytes -/// already encoded and written to the output buffer are provided. -/// -/// It is guaranteed that all input leading up to the invalid data has -/// already been encoded and written to the output buffer. -#[derive(Debug, Copy, Clone, PartialEq)] -pub struct DecodeError { - pub error_range: (usize, usize), - pub output_bytes_written: usize, -} diff --git a/sub_crates/text_encoding/src/single_byte.rs b/sub_crates/text_encoding/src/single_byte.rs deleted file mode 100644 index 0a1ccc7..0000000 --- a/sub_crates/text_encoding/src/single_byte.rs +++ /dev/null @@ -1,475 +0,0 @@ -//! Single byte encodings that extend ascii. Their code is auto-generated -//! by build.rs - -use core; -use {DecodeError, DecodeResult, EncodeError, EncodeResult}; - -pub mod ibm866 { - // Generated by build.rs - include!(concat!(env!("OUT_DIR"), "/ibm866.rs")); -} - -pub mod iso_8859_2 { - // Generated by build.rs - include!(concat!(env!("OUT_DIR"), "/iso-8859-2.rs")); -} - -// pub mod iso_8859_3 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-3.rs")); -// } - -// pub mod iso_8859_4 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-4.rs")); -// } - -// pub mod iso_8859_5 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-5.rs")); -// } - -// pub mod iso_8859_6 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-6.rs")); -// } - -pub mod iso_8859_7 { - // Generated by build.rs - include!(concat!(env!("OUT_DIR"), "/iso-8859-7.rs")); -} - -// pub mod iso_8859_8 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-8.rs")); -// } - -// pub mod iso_8859_10 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-10.rs")); -// } - -// pub mod iso_8859_13 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-13.rs")); -// } - -// pub mod iso_8859_14 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-14.rs")); -// } - -// pub mod iso_8859_15 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-15.rs")); -// } - -// pub mod iso_8859_16 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/iso-8859-16.rs")); -// } - -// pub mod koi8_r { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/koi8-r.rs")); -// } - -// pub mod koi8_u { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/koi8-u.rs")); -// } - -// pub mod macintosh { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/macintosh.rs")); -// } - -// pub mod windows874 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/windows-874.rs")); -// } - -// pub mod windows1250 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/windows-1250.rs")); -// } - -// pub mod windows1251 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/windows-1251.rs")); -// } - -pub mod windows1252 { - // Generated by build.rs - include!(concat!(env!("OUT_DIR"), "/windows-1252.rs")); -} - -// pub mod windows1253 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/windows-1253.rs")); -// } - -// pub mod windows1254 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/windows-1254.rs")); -// } - -// pub mod windows1255 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/windows-1255.rs")); -// } - -// pub mod windows1256 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/windows-1256.rs")); -// } - -// pub mod windows1257 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/windows-1257.rs")); -// } - -// pub mod windows1258 { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/windows-1258.rs")); -// } - -// pub mod x_mac_cyrillic { -// // Generated by build.rs -// include!(concat!(env!("OUT_DIR"), "/x-mac-cyrillic.rs")); -// } - -/// This is shared among the single byte encoders, and is shallowly -/// wrapped in each of their modules. -#[inline] -fn single_byte_encode_from_str<'a>( - table: &[(char, u8)], - input: &str, - output: &'a mut [u8], -) -> EncodeResult<'a> { - // Do the encode. - let mut input_i = 0; - let mut output_i = 0; - for (offset, c) in input.char_indices() { - if output_i >= output.len() { - break; - } - if c as u32 <= 127 { - output[output_i] = c as u8; - output_i += 1; - input_i = offset + 1; - } else { - if let Ok(i) = table.binary_search_by_key(&c, |x| x.0) { - output[output_i] = table[i].1; - output_i += 1; - input_i = offset + 1; - } else { - return Err(EncodeError { - character: c, - error_range: (offset, offset + c.len_utf8()), - output_bytes_written: output_i, - }); - } - } - } - - // Calculate how much of the input was consumed. - if input_i > input.len() { - input_i = input.len(); - } else { - while !input.is_char_boundary(input_i) { - input_i += 1; - } - } - - Ok((input_i, &output[..output_i])) -} - -/// This is shared among the single byte decoders, and is shallowly -/// wrapped in each of their modules. -#[inline] -fn single_byte_decode_to_str<'a>( - table: &[char; 128], - input: &[u8], - output: &'a mut [u8], -) -> DecodeResult<'a> { - let mut input_i = 0; - let mut output_i = 0; - for &byte in input.iter() { - if byte < 0x80 { - // 1-byte case - if output_i >= output.len() { - break; - } - output[output_i] = byte; - input_i += 1; - output_i += 1; - } else { - // Use lookup table. - let code = table[byte as usize - 0x80]; - if code == '�' { - // Error: undefined byte. - return Err(DecodeError { - error_range: (input_i, input_i + 1), - output_bytes_written: output_i, - }); - } - // Encode to utf8 - let mut buf = [0u8; 4]; - let s = code.encode_utf8(&mut buf); - if (output_i + s.len()) > output.len() { - break; - } - output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes()); - input_i += 1; - output_i += s.len(); - } - } - - Ok((input_i, unsafe { - core::str::from_utf8_unchecked(&output[..output_i]) - })) -} - -//=========================================================================== - -// Testing is done with iso-8859-7, since it has a few undefined characters, -// allowing us to test handling of those. -#[cfg(test)] -mod tests { - use super::iso_8859_7::*; - use {DecodeError, EncodeError}; - - #[test] - fn encode_01() { - let text = "Hello world!"; - let mut buf = [0u8; 0]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_02() { - let text = "Hello world!"; - let mut buf = [0u8; 1]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 1); - assert_eq!(encoded, "H".as_bytes()); - } - - #[test] - fn encode_03() { - let text = "Hello world!"; - let mut buf = [0u8; 2]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 2); - assert_eq!(encoded, "He".as_bytes()); - } - - #[test] - fn encode_04() { - let text = "Hello world!"; - let mut buf = [0u8; 64]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 12); - assert_eq!(encoded, "Hello world!".as_bytes()); - } - - #[test] - fn encode_05() { - let text = "Hello world!こ"; - let mut buf = [0u8; 12]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 12); - assert_eq!(encoded, "Hello world!".as_bytes()); - } - - #[test] - fn decode_01() { - let data = [ - 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" - let mut buf = [0u8; 0]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_02() { - let data = [ - 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" - let mut buf = [0u8; 1]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 1); - assert_eq!(decoded, "H"); - } - - #[test] - fn decode_03() { - let data = [ - 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" - let mut buf = [0u8; 2]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 2); - assert_eq!(decoded, "He"); - } - - #[test] - fn decode_04() { - let data = [ - 0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 12); - assert_eq!(decoded, "Hello world!"); - } - - #[test] - fn decode_05() { - let data = [ - 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, - 0xCF, 0xD0, 0xD1, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, - ]; // "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ" - let mut buf = [0u8; 128]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 24); - assert_eq!(decoded, "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"); - } - - #[test] - fn encode_error_01() { - let text = "こello world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: 'こ', - error_range: (0, 3), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn encode_error_02() { - let text = "\u{00C0}ello world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: '\u{00C0}', - error_range: (0, 2), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn encode_error_03() { - let text = "Hこllo world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: 'こ', - error_range: (1, 4), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn encode_error_04() { - let text = "H\u{00C0}llo world!"; - let mut buf = [0u8; 64]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: '\u{00C0}', - error_range: (1, 3), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn encode_error_05() { - let text = "Heこlo world!"; - let mut buf = [0u8; 3]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: 'こ', - error_range: (2, 5), - output_bytes_written: 2, - }) - ); - } - - #[test] - fn encode_error_06() { - let text = "He\u{00C0}lo world!"; - let mut buf = [0u8; 3]; - assert_eq!( - encode_from_str(text, &mut buf), - Err(EncodeError { - character: '\u{00C0}', - error_range: (2, 4), - output_bytes_written: 2, - }) - ); - } - - #[test] - fn decode_error_01() { - let data = [ - 0x48, 0xAE, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" with an error on the second byte (undefined byte). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (1, 2), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn decode_error_02() { - let data = [ - 0x48, 0xD2, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" with an error on the second byte (undefined byte). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (1, 2), - output_bytes_written: 1, - }) - ); - } - - #[test] - fn decode_error_03() { - let data = [ - 0x48, 0xFF, 0x6C, 0x6C, 0x6F, 0x20, 0x77, 0x6F, 0x72, 0x6C, 0x64, 0x21, - ]; // "Hello world!" with an error on the second byte (undefined byte). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (1, 2), - output_bytes_written: 1, - }) - ); - } -} diff --git a/sub_crates/text_encoding/src/utf16_be.rs b/sub_crates/text_encoding/src/utf16_be.rs deleted file mode 100644 index 7f1ef4f..0000000 --- a/sub_crates/text_encoding/src/utf16_be.rs +++ /dev/null @@ -1,362 +0,0 @@ -//! Encoding/decoding functions for big-endian UTF-16. -//! -//! Because both utf8 and utf16 can represent the entirety of unicode, the -//! only possible error is when invalid utf16 is encountered when decoding -//! to utf8. - -use core; -use utils::{from_big_endian_u16, to_big_endian_u16}; -use {DecodeError, DecodeResult, EncodeResult}; - -pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { - // Do the encode. - let mut input_i = 0; - let mut output_i = 0; - for (offset, c) in input.char_indices() { - let mut code = c as u32; - if code <= 0xFFFF { - // One code unit - if (output_i + 1) < output.len() { - let val = to_big_endian_u16(code as u16); - output[output_i] = val[0]; - output[output_i + 1] = val[1]; - output_i += 2; - input_i = offset + 1; - } else { - break; - } - } else if (output_i + 3) < output.len() { - // Two code units - code -= 0x10000; - let first = to_big_endian_u16(0xD800 | ((code >> 10) as u16)); - let second = to_big_endian_u16(0xDC00 | ((code as u16) & 0x3FF)); - output[output_i] = first[0]; - output[output_i + 1] = first[1]; - output[output_i + 2] = second[0]; - output[output_i + 3] = second[1]; - output_i += 4; - input_i = offset + 1; - } else { - break; - } - } - - // Calculate how much of the input was consumed. - if input_i > input.len() { - input_i = input.len(); - } else { - while !input.is_char_boundary(input_i) { - input_i += 1; - } - } - - Ok((input_i, &output[..output_i])) -} - -pub fn decode_to_str<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { - let mut input_i = 0; - let mut output_i = 0; - - // Loop through the input, getting 2 bytes at a time. - let mut itr = input.chunks(2); - while let Some(bytes) = itr.next() { - if bytes.len() < 2 { - break; - } - - // Decode to scalar value. - let code = { - let code_1 = from_big_endian_u16([bytes[0], bytes[1]]); - if code_1 < 0xD800 || code_1 > 0xDFFF { - // Single code unit. - unsafe { core::char::from_u32_unchecked(code_1 as u32) } - } else if (code_1 & 0xFC00) == 0xDC00 { - // Error: orphaned second half of a surrogate pair. - return Err(DecodeError { - error_range: (input_i, input_i + 2), - output_bytes_written: output_i, - }); - } else { - // Two code units. - - // Get the second code unit, if possible. - if (input_i + 3) >= input.len() { - break; - } - let bytes_2 = itr.next().unwrap(); - let code_2 = from_big_endian_u16([bytes_2[0], bytes_2[1]]); - if (code_2 & 0xFC00) != 0xDC00 { - // Error: second half is not valid surrogate. - return Err(DecodeError { - error_range: (input_i, input_i + 2), - output_bytes_written: output_i, - }); - } - - unsafe { - core::char::from_u32_unchecked( - (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000, - ) - } - } - }; - - // Encode to utf8. - let mut buf = [0u8; 4]; - let s = code.encode_utf8(&mut buf); - if (output_i + s.len()) > output.len() { - break; - } - output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes()); - - // Update our counters. - input_i += code.len_utf16() * 2; - output_i += s.len(); - } - - Ok((input_i, unsafe { - core::str::from_utf8_unchecked(&output[..output_i]) - })) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn encode_01() { - let text = "こんにちは!"; - let mut buf = [0u8; 1]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_02() { - let text = "こんにちは!"; - let mut buf = [0u8; 2]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(encoded, &[0x30, 0x53]); - } - - #[test] - fn encode_03() { - let text = "こんにちは!"; - let mut buf = [0u8; 3]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(encoded, &[0x30, 0x53]); - } - - #[test] - fn encode_04() { - let text = "😺😼"; - let mut buf = [0u8; 3]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_05() { - let text = "😺😼"; - let mut buf = [0u8; 4]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(encoded, &[0xD8, 0x3D, 0xDE, 0x3A]); - } - - #[test] - fn encode_06() { - let text = "😺😼"; - let mut buf = [0u8; 7]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(encoded, &[0xD8, 0x3D, 0xDE, 0x3A]); - } - - #[test] - fn decode_01() { - let data = [ - 0x30, 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0xFF, 0x01, - ]; // "こんにちは!" - let mut buf = [0u8; 2]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_02() { - let data = [ - 0x30, 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0xFF, 0x01, - ]; // "こんにちは!" - let mut buf = [0u8; 3]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 2); - assert_eq!(decoded, "こ"); - } - - #[test] - fn decode_03() { - let data = [ - 0x30, 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0xFF, 0x01, - ]; // "こんにちは!" - let mut buf = [0u8; 5]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 2); - assert_eq!(decoded, "こ"); - } - - #[test] - fn decode_04() { - let data = [0xD8, 0x3D, 0xDE, 0x3A, 0xD8, 0x3D, 0xDE, 0x3C]; // "😺😼" - let mut buf = [0u8; 3]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_05() { - let data = [0xD8, 0x3D, 0xDE, 0x3A, 0xD8, 0x3D, 0xDE, 0x3C]; // "😺😼" - let mut buf = [0u8; 4]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_06() { - let data = [0xD8, 0x3D, 0xDE, 0x3A, 0xD8, 0x3D, 0xDE, 0x3C]; // "😺😼" - let mut buf = [0u8; 7]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_07() { - let data = [0xD8, 0x3D, 0xDE, 0x3A, 0xD8, 0x3D]; // "😺😼" with last codepoint chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_08() { - let data = [0xD8, 0x3D, 0xDE, 0x3A, 0xD8, 0x3D, 0xDE]; // "😺😼" with last byte chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_09() { - let data = [0xD8, 0x3D, 0xDE, 0x3A, 0xD8]; // "😺😼" with last 3 bytes chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_error_01() { - let data = [ - 0xDE, 0x3A, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the first char (end surrogate) - let mut buf = [0u8; 2]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (0, 2), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_02() { - let data = [ - 0x30, 0x53, 0xDE, 0x3A, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the second char (end surrogate) - let mut buf = [0u8; 3]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (2, 4), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_03() { - let data = [ - 0x30, 0x53, 0x30, 0x93, 0x30, 0x6B, 0xDE, 0x3A, 0x30, 0x6F, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the fourth char (end surrogate) - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (6, 8), - output_bytes_written: 9, - }) - ); - } - - #[test] - fn decode_error_04() { - let data = [ - 0xD8, 0x3D, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the first char (start surrogate) - let mut buf = [0u8; 2]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (0, 2), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_05() { - let data = [ - 0x30, 0x53, 0xD8, 0x3D, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the second char (start surrogate) - let mut buf = [0u8; 3]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (2, 4), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_06() { - let data = [ - 0x30, 0x53, 0x30, 0x93, 0x30, 0x6B, 0xD8, 0x3D, 0x30, 0x6F, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the fourth char (start surrogate) - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (6, 8), - output_bytes_written: 9, - }) - ); - } -} diff --git a/sub_crates/text_encoding/src/utf16_le.rs b/sub_crates/text_encoding/src/utf16_le.rs deleted file mode 100644 index de4eba8..0000000 --- a/sub_crates/text_encoding/src/utf16_le.rs +++ /dev/null @@ -1,362 +0,0 @@ -//! Encoding/decoding functions for little-endian UTF-16. -//! -//! Because both utf8 and utf16 can represent the entirety of unicode, the -//! only possible error is when invalid utf16 is encountered when decoding -//! to utf8. - -use core; -use utils::{from_little_endian_u16, to_little_endian_u16}; -use {DecodeError, DecodeResult, EncodeResult}; - -pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { - // Do the encode. - let mut input_i = 0; - let mut output_i = 0; - for (offset, c) in input.char_indices() { - let mut code = c as u32; - if code <= 0xFFFF { - // One code unit - if (output_i + 1) < output.len() { - let val = to_little_endian_u16(code as u16); - output[output_i] = val[0]; - output[output_i + 1] = val[1]; - output_i += 2; - input_i = offset + 1; - } else { - break; - } - } else if (output_i + 3) < output.len() { - // Two code units - code -= 0x10000; - let first = to_little_endian_u16(0xD800 | ((code >> 10) as u16)); - let second = to_little_endian_u16(0xDC00 | ((code as u16) & 0x3FF)); - output[output_i] = first[0]; - output[output_i + 1] = first[1]; - output[output_i + 2] = second[0]; - output[output_i + 3] = second[1]; - output_i += 4; - input_i = offset + 1; - } else { - break; - } - } - - // Calculate how much of the input was consumed. - if input_i > input.len() { - input_i = input.len(); - } else { - while !input.is_char_boundary(input_i) { - input_i += 1; - } - } - - Ok((input_i, &output[..output_i])) -} - -pub fn decode_to_str<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { - let mut input_i = 0; - let mut output_i = 0; - - // Loop through the input, getting 2 bytes at a time. - let mut itr = input.chunks(2); - while let Some(bytes) = itr.next() { - if bytes.len() < 2 { - break; - } - - // Decode to scalar value. - let code = { - let code_1 = from_little_endian_u16([bytes[0], bytes[1]]); - if code_1 < 0xD800 || code_1 > 0xDFFF { - // Single code unit. - unsafe { core::char::from_u32_unchecked(code_1 as u32) } - } else if (code_1 & 0xFC00) == 0xDC00 { - // Error: orphaned second half of a surrogate pair. - return Err(DecodeError { - error_range: (input_i, input_i + 2), - output_bytes_written: output_i, - }); - } else { - // Two code units. - - // Get the second code unit, if possible. - if (input_i + 3) >= input.len() { - break; - } - let bytes_2 = itr.next().unwrap(); - let code_2 = from_little_endian_u16([bytes_2[0], bytes_2[1]]); - if (code_2 & 0xFC00) != 0xDC00 { - // Error: second half is not valid surrogate. - return Err(DecodeError { - error_range: (input_i, input_i + 2), - output_bytes_written: output_i, - }); - } - - unsafe { - core::char::from_u32_unchecked( - (((code_1 as u32 - 0xD800) << 10) | (code_2 as u32 - 0xDC00)) + 0x10000, - ) - } - } - }; - - // Encode to utf8. - let mut buf = [0u8; 4]; - let s = code.encode_utf8(&mut buf); - if (output_i + s.len()) > output.len() { - break; - } - output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes()); - - // Update our counters. - input_i += code.len_utf16() * 2; - output_i += s.len(); - } - - Ok((input_i, unsafe { - core::str::from_utf8_unchecked(&output[..output_i]) - })) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn encode_01() { - let text = "こんにちは!"; - let mut buf = [0u8; 1]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_02() { - let text = "こんにちは!"; - let mut buf = [0u8; 2]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(encoded, &[0x53, 0x30]); - } - - #[test] - fn encode_03() { - let text = "こんにちは!"; - let mut buf = [0u8; 3]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(encoded, &[0x53, 0x30]); - } - - #[test] - fn encode_04() { - let text = "😺😼"; - let mut buf = [0u8; 3]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_05() { - let text = "😺😼"; - let mut buf = [0u8; 4]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(encoded, &[0x3D, 0xD8, 0x3A, 0xDE]); - } - - #[test] - fn encode_06() { - let text = "😺😼"; - let mut buf = [0u8; 7]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(encoded, &[0x3D, 0xD8, 0x3A, 0xDE]); - } - - #[test] - fn decode_01() { - let data = [ - 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30, 0x01, 0xFF, - ]; // "こんにちは!" - let mut buf = [0u8; 2]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_02() { - let data = [ - 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30, 0x01, 0xFF, - ]; // "こんにちは!" - let mut buf = [0u8; 3]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 2); - assert_eq!(decoded, "こ"); - } - - #[test] - fn decode_03() { - let data = [ - 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30, 0x01, 0xFF, - ]; // "こんにちは!" - let mut buf = [0u8; 5]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 2); - assert_eq!(decoded, "こ"); - } - - #[test] - fn decode_04() { - let data = [0x3D, 0xD8, 0x3A, 0xDE, 0x3D, 0xD8, 0x3C, 0xDE]; // "😺😼" - let mut buf = [0u8; 3]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_05() { - let data = [0x3D, 0xD8, 0x3A, 0xDE, 0x3D, 0xD8, 0x3C, 0xDE]; // "😺😼" - let mut buf = [0u8; 4]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_06() { - let data = [0x3D, 0xD8, 0x3A, 0xDE, 0x3D, 0xD8, 0x3C, 0xDE]; // "😺😼" - let mut buf = [0u8; 7]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_07() { - let data = [0x3D, 0xD8, 0x3A, 0xDE, 0x3D, 0xD8]; // "😺😼" with last codepoint chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_08() { - let data = [0x3D, 0xD8, 0x3A, 0xDE, 0x3D, 0xD8, 0x3C]; // "😺😼" with last byte chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_09() { - let data = [0x3D, 0xD8, 0x3A, 0xDE, 0x3D]; // "😺😼" with last 3 bytes chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_error_01() { - let data = [ - 0x3A, 0xDE, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30, 0x01, 0xFF, - ]; // "こんにちは!" with an error on the first char (end surrogate) - let mut buf = [0u8; 2]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (0, 2), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_02() { - let data = [ - 0x53, 0x30, 0x3A, 0xDE, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30, 0x01, 0xFF, - ]; // "こんにちは!" with an error on the second char (end surrogate) - let mut buf = [0u8; 3]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (2, 4), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_03() { - let data = [ - 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x3A, 0xDE, 0x6F, 0x30, 0x01, 0xFF, - ]; // "こんにちは!" with an error on the fourth char (end surrogate) - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (6, 8), - output_bytes_written: 9, - }) - ); - } - - #[test] - fn decode_error_04() { - let data = [ - 0x3D, 0xD8, 0x93, 0x30, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30, 0x01, 0xFF, - ]; // "こんにちは!" with an error on the first char (start surrogate) - let mut buf = [0u8; 2]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (0, 2), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_05() { - let data = [ - 0x53, 0x30, 0x3D, 0xD8, 0x6B, 0x30, 0x61, 0x30, 0x6F, 0x30, 0x01, 0xFF, - ]; // "こんにちは!" with an error on the second char (start surrogate) - let mut buf = [0u8; 3]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (2, 4), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_06() { - let data = [ - 0x53, 0x30, 0x93, 0x30, 0x6B, 0x30, 0x3D, 0xD8, 0x6F, 0x30, 0x01, 0xFF, - ]; // "こんにちは!" with an error on the fourth char (start surrogate) - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (6, 8), - output_bytes_written: 9, - }) - ); - } -} diff --git a/sub_crates/text_encoding/src/utf32_be.rs b/sub_crates/text_encoding/src/utf32_be.rs deleted file mode 100644 index faaf0e1..0000000 --- a/sub_crates/text_encoding/src/utf32_be.rs +++ /dev/null @@ -1,327 +0,0 @@ -//! Encoding/decoding functions for big-endian UTF-32. -//! -//! Because both utf8 and utf32 can represent the entirety of unicode, the -//! only possible error is when invalid utf32 is encountered when decoding -//! to utf8. - -use core; -use utils::{from_big_endian_u32, to_big_endian_u32}; -use {DecodeError, DecodeResult, EncodeResult}; - -pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { - // Do the encode. - let mut input_i = 0; - let mut output_i = 0; - for (offset, c) in input.char_indices() { - if (output_i + 3) < output.len() { - let mut code = to_big_endian_u32(c as u32); - output[output_i] = code[0]; - output[output_i + 1] = code[1]; - output[output_i + 2] = code[2]; - output[output_i + 3] = code[3]; - output_i += 4; - input_i = offset + 1; - } else { - break; - } - } - - // Calculate how much of the input was consumed. - if input_i > input.len() { - input_i = input.len(); - } else { - while !input.is_char_boundary(input_i) { - input_i += 1; - } - } - - Ok((input_i, &output[..output_i])) -} - -pub fn decode_to_str<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { - let mut input_i = 0; - let mut output_i = 0; - - // Loop through the input, getting 4 bytes at a time. - let mut itr = input.chunks(4); - while let Some(bytes) = itr.next() { - if bytes.len() < 4 { - break; - } - - // Do the decode. - if let Some(code) = core::char::from_u32(from_big_endian_u32([ - bytes[0], bytes[1], bytes[2], bytes[3], - ])) { - // Encode to utf8. - let mut buf = [0u8; 4]; - let s = code.encode_utf8(&mut buf); - if (output_i + s.len()) > output.len() { - break; - } - output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes()); - - // Update our counters. - input_i += 4; - output_i += s.len(); - } else { - // Error: invalid codepoint. - return Err(DecodeError { - error_range: (input_i, input_i + 4), - output_bytes_written: output_i, - }); - } - } - - Ok((input_i, unsafe { - core::str::from_utf8_unchecked(&output[..output_i]) - })) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn encode_01() { - let text = "こんにちは!"; - let mut buf = [0u8; 3]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_02() { - let text = "こんにちは!"; - let mut buf = [0u8; 4]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(encoded, &[0x00, 0x00, 0x30, 0x53]); - } - - #[test] - fn encode_03() { - let text = "こんにちは!"; - let mut buf = [0u8; 7]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(encoded, &[0x00, 0x00, 0x30, 0x53]); - } - - #[test] - fn encode_04() { - let text = "😺😼"; - let mut buf = [0u8; 3]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_05() { - let text = "😺😼"; - let mut buf = [0u8; 4]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(encoded, &[0x00, 0x01, 0xF6, 0x3A]); - } - - #[test] - fn encode_06() { - let text = "😺😼"; - let mut buf = [0u8; 7]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(encoded, &[0x00, 0x01, 0xF6, 0x3A]); - } - - #[test] - fn decode_01() { - let data = [ - 0x00, 0x00, 0x30, 0x53, 0x00, 0x00, 0x30, 0x93, 0x00, 0x00, 0x30, 0x6B, 0x00, 0x00, - 0x30, 0x61, 0x00, 0x00, 0x30, 0x6F, 0x00, 0x00, 0xFF, 0x01, - ]; // "こんにちは!" - let mut buf = [0u8; 2]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_02() { - let data = [ - 0x00, 0x00, 0x30, 0x53, 0x00, 0x00, 0x30, 0x93, 0x00, 0x00, 0x30, 0x6B, 0x00, 0x00, - 0x30, 0x61, 0x00, 0x00, 0x30, 0x6F, 0x00, 0x00, 0xFF, 0x01, - ]; // "こんにちは!" - let mut buf = [0u8; 3]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "こ"); - } - - #[test] - fn decode_03() { - let data = [ - 0x00, 0x00, 0x30, 0x53, 0x00, 0x00, 0x30, 0x93, 0x00, 0x00, 0x30, 0x6B, 0x00, 0x00, - 0x30, 0x61, 0x00, 0x00, 0x30, 0x6F, 0x00, 0x00, 0xFF, 0x01, - ]; // "こんにちは!" - let mut buf = [0u8; 5]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "こ"); - } - - #[test] - fn decode_04() { - let data = [0x00, 0x01, 0xF6, 0x3A, 0x00, 0x01, 0xF6, 0x3C]; // "😺😼" - let mut buf = [0u8; 3]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_05() { - let data = [0x00, 0x01, 0xF6, 0x3A, 0x00, 0x01, 0xF6, 0x3C]; // "😺😼" - let mut buf = [0u8; 4]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_06() { - let data = [0x00, 0x01, 0xF6, 0x3A, 0x00, 0x01, 0xF6, 0x3C]; // "😺😼" - let mut buf = [0u8; 7]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_07() { - let data = [0x00, 0x01, 0xF6, 0x3A, 0x00, 0x01, 0xF6]; // "😺😼" with last byte chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_08() { - let data = [0x00, 0x01, 0xF6, 0x3A, 0x00, 0x01]; // "😺😼" with last 2 bytes chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_09() { - let data = [0x00, 0x01, 0xF6, 0x3A, 0x00]; // "😺😼" with last 3 bytes chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_error_01() { - let data = [ - 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x30, 0x93, 0x00, 0x00, 0x30, 0x6B, 0x00, 0x00, - 0x30, 0x61, 0x00, 0x00, 0x30, 0x6F, 0x00, 0x00, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the first char (value out of range) - let mut buf = [0u8; 2]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (0, 4), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_02() { - let data = [ - 0x00, 0x00, 0xD8, 0x00, 0x00, 0x00, 0x30, 0x93, 0x00, 0x00, 0x30, 0x6B, 0x00, 0x00, - 0x30, 0x61, 0x00, 0x00, 0x30, 0x6F, 0x00, 0x00, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the first char (value in surrogate range) - let mut buf = [0u8; 2]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (0, 4), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_03() { - let data = [ - 0x00, 0x00, 0xDF, 0xFF, 0x00, 0x00, 0x30, 0x93, 0x00, 0x00, 0x30, 0x6B, 0x00, 0x00, - 0x30, 0x61, 0x00, 0x00, 0x30, 0x6F, 0x00, 0x00, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the first char (value in surrogate range) - let mut buf = [0u8; 64]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (0, 4), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_04() { - let data = [ - 0x00, 0x00, 0x30, 0x53, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x30, 0x6B, 0x00, 0x00, - 0x30, 0x61, 0x00, 0x00, 0x30, 0x6F, 0x00, 0x00, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the second char (value out of range) - let mut buf = [0u8; 64]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (4, 8), - output_bytes_written: 3, - }) - ); - assert_eq!(&buf[..3], &[0xE3, 0x81, 0x93]); - } - - #[test] - fn decode_error_05() { - let data = [ - 0x00, 0x00, 0x30, 0x53, 0x00, 0x00, 0xD8, 0x00, 0x00, 0x00, 0x30, 0x6B, 0x00, 0x00, - 0x30, 0x61, 0x00, 0x00, 0x30, 0x6F, 0x00, 0x00, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the second char (value in surrogate range) - let mut buf = [0u8; 64]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (4, 8), - output_bytes_written: 3, - }) - ); - assert_eq!(&buf[..3], &[0xE3, 0x81, 0x93]); - } - - #[test] - fn decode_error_06() { - let data = [ - 0x00, 0x00, 0x30, 0x53, 0x00, 0x00, 0xDF, 0xFF, 0x00, 0x00, 0x30, 0x6B, 0x00, 0x00, - 0x30, 0x61, 0x00, 0x00, 0x30, 0x6F, 0x00, 0x00, 0xFF, 0x01, - ]; // "こんにちは!" with an error on the second char (value in surrogate range) - let mut buf = [0u8; 64]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (4, 8), - output_bytes_written: 3, - }) - ); - assert_eq!(&buf[..3], &[0xE3, 0x81, 0x93]); - } -} diff --git a/sub_crates/text_encoding/src/utf32_le.rs b/sub_crates/text_encoding/src/utf32_le.rs deleted file mode 100644 index 8c44351..0000000 --- a/sub_crates/text_encoding/src/utf32_le.rs +++ /dev/null @@ -1,327 +0,0 @@ -//! Encoding/decoding functions for big-endian UTF-32. -//! -//! Because both utf8 and utf32 can represent the entirety of unicode, the -//! only possible error is when invalid utf32 is encountered when decoding -//! to utf8. - -use core; -use utils::{from_little_endian_u32, to_little_endian_u32}; -use {DecodeError, DecodeResult, EncodeResult}; - -pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { - // Do the encode. - let mut input_i = 0; - let mut output_i = 0; - for (offset, c) in input.char_indices() { - if (output_i + 3) < output.len() { - let mut code = to_little_endian_u32(c as u32); - output[output_i] = code[0]; - output[output_i + 1] = code[1]; - output[output_i + 2] = code[2]; - output[output_i + 3] = code[3]; - output_i += 4; - input_i = offset + 1; - } else { - break; - } - } - - // Calculate how much of the input was consumed. - if input_i > input.len() { - input_i = input.len(); - } else { - while !input.is_char_boundary(input_i) { - input_i += 1; - } - } - - Ok((input_i, &output[..output_i])) -} - -pub fn decode_to_str<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { - let mut input_i = 0; - let mut output_i = 0; - - // Loop through the input, getting 4 bytes at a time. - let mut itr = input.chunks(4); - while let Some(bytes) = itr.next() { - if bytes.len() < 4 { - break; - } - - // Do the decode. - if let Some(code) = core::char::from_u32(from_little_endian_u32([ - bytes[0], bytes[1], bytes[2], bytes[3], - ])) { - // Encode to utf8. - let mut buf = [0u8; 4]; - let s = code.encode_utf8(&mut buf); - if (output_i + s.len()) > output.len() { - break; - } - output[output_i..(output_i + s.len())].copy_from_slice(s.as_bytes()); - - // Update our counters. - input_i += 4; - output_i += s.len(); - } else { - // Error: invalid codepoint. - return Err(DecodeError { - error_range: (input_i, input_i + 4), - output_bytes_written: output_i, - }); - } - } - - Ok((input_i, unsafe { - core::str::from_utf8_unchecked(&output[..output_i]) - })) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn encode_01() { - let text = "こんにちは!"; - let mut buf = [0u8; 3]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_02() { - let text = "こんにちは!"; - let mut buf = [0u8; 4]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(encoded, &[0x53, 0x30, 0x00, 0x00]); - } - - #[test] - fn encode_03() { - let text = "こんにちは!"; - let mut buf = [0u8; 7]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(encoded, &[0x53, 0x30, 0x00, 0x00]); - } - - #[test] - fn encode_04() { - let text = "😺😼"; - let mut buf = [0u8; 3]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_05() { - let text = "😺😼"; - let mut buf = [0u8; 4]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(encoded, &[0x3A, 0xF6, 0x01, 0x00]); - } - - #[test] - fn encode_06() { - let text = "😺😼"; - let mut buf = [0u8; 7]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(encoded, &[0x3A, 0xF6, 0x01, 0x00]); - } - - #[test] - fn decode_01() { - let data = [ - 0x53, 0x30, 0x00, 0x00, 0x93, 0x30, 0x00, 0x00, 0x6B, 0x30, 0x00, 0x00, 0x61, 0x30, - 0x00, 0x00, 0x6F, 0x30, 0x00, 0x00, 0x01, 0xFF, 0x00, 0x00, - ]; // "こんにちは!" - let mut buf = [0u8; 2]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_02() { - let data = [ - 0x53, 0x30, 0x00, 0x00, 0x93, 0x30, 0x00, 0x00, 0x6B, 0x30, 0x00, 0x00, 0x61, 0x30, - 0x00, 0x00, 0x6F, 0x30, 0x00, 0x00, 0x01, 0xFF, 0x00, 0x00, - ]; // "こんにちは!" - let mut buf = [0u8; 3]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "こ"); - } - - #[test] - fn decode_03() { - let data = [ - 0x53, 0x30, 0x00, 0x00, 0x93, 0x30, 0x00, 0x00, 0x6B, 0x30, 0x00, 0x00, 0x61, 0x30, - 0x00, 0x00, 0x6F, 0x30, 0x00, 0x00, 0x01, 0xFF, 0x00, 0x00, - ]; // "こんにちは!" - let mut buf = [0u8; 5]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "こ"); - } - - #[test] - fn decode_04() { - let data = [0x3A, 0xF6, 0x01, 0x00, 0x3C, 0xF6, 0x01, 0x00]; // "😺😼" - let mut buf = [0u8; 3]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_05() { - let data = [0x3A, 0xF6, 0x01, 0x00, 0x3C, 0xF6, 0x01, 0x00]; // "😺😼" - let mut buf = [0u8; 4]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_06() { - let data = [0x3A, 0xF6, 0x01, 0x00, 0x3C, 0xF6, 0x01, 0x00]; // "😺😼" - let mut buf = [0u8; 7]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_07() { - let data = [0x3A, 0xF6, 0x01, 0x00, 0x3C, 0xF6, 0x01]; // "😺😼" with last byte chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_08() { - let data = [0x3A, 0xF6, 0x01, 0x00, 0x3C, 0xF6]; // "😺😼" with last 2 bytes chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_09() { - let data = [0x3A, 0xF6, 0x01, 0x00, 0x3C]; // "😺😼" with last 3 bytes chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 4); - assert_eq!(decoded, "😺"); - } - - #[test] - fn decode_error_01() { - let data = [ - 0x00, 0x00, 0x11, 0x00, 0x93, 0x30, 0x00, 0x00, 0x6B, 0x30, 0x00, 0x00, 0x61, 0x30, - 0x00, 0x00, 0x6F, 0x30, 0x00, 0x00, 0x01, 0xFF, 0x00, 0x00, - ]; // "こんにちは!" with an error on the first char (value out of range) - let mut buf = [0u8; 2]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (0, 4), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_02() { - let data = [ - 0x00, 0xD8, 0x00, 0x00, 0x93, 0x30, 0x00, 0x00, 0x6B, 0x30, 0x00, 0x00, 0x61, 0x30, - 0x00, 0x00, 0x6F, 0x30, 0x00, 0x00, 0x01, 0xFF, 0x00, 0x00, - ]; // "こんにちは!" with an error on the first char (value in surrogate range) - let mut buf = [0u8; 2]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (0, 4), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_03() { - let data = [ - 0xFF, 0xDF, 0x00, 0x00, 0x93, 0x30, 0x00, 0x00, 0x6B, 0x30, 0x00, 0x00, 0x61, 0x30, - 0x00, 0x00, 0x6F, 0x30, 0x00, 0x00, 0x01, 0xFF, 0x00, 0x00, - ]; // "こんにちは!" with an error on the first char (value in surrogate range) - let mut buf = [0u8; 64]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (0, 4), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_04() { - let data = [ - 0x53, 0x30, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x6B, 0x30, 0x00, 0x00, 0x61, 0x30, - 0x00, 0x00, 0x6F, 0x30, 0x00, 0x00, 0x01, 0xFF, 0x00, 0x00, - ]; // "こんにちは!" with an error on the second char (value out of range) - let mut buf = [0u8; 64]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (4, 8), - output_bytes_written: 3, - }) - ); - assert_eq!(&buf[..3], &[0xE3, 0x81, 0x93]); - } - - #[test] - fn decode_error_05() { - let data = [ - 0x53, 0x30, 0x00, 0x00, 0x00, 0xD8, 0x00, 0x00, 0x6B, 0x30, 0x00, 0x00, 0x61, 0x30, - 0x00, 0x00, 0x6F, 0x30, 0x00, 0x00, 0x01, 0xFF, 0x00, 0x00, - ]; // "こんにちは!" with an error on the second char (value in surrogate range) - let mut buf = [0u8; 64]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (4, 8), - output_bytes_written: 3, - }) - ); - assert_eq!(&buf[..3], &[0xE3, 0x81, 0x93]); - } - - #[test] - fn decode_error_06() { - let data = [ - 0x53, 0x30, 0x00, 0x00, 0xFF, 0xDF, 0x00, 0x00, 0x6B, 0x30, 0x00, 0x00, 0x61, 0x30, - 0x00, 0x00, 0x6F, 0x30, 0x00, 0x00, 0x01, 0xFF, 0x00, 0x00, - ]; // "こんにちは!" with an error on the second char (value in surrogate range) - let mut buf = [0u8; 64]; - assert_eq!( - decode_to_str(&data, &mut buf), - Err(DecodeError { - error_range: (4, 8), - output_bytes_written: 3, - }) - ); - assert_eq!(&buf[..3], &[0xE3, 0x81, 0x93]); - } -} diff --git a/sub_crates/text_encoding/src/utf8.rs b/sub_crates/text_encoding/src/utf8.rs deleted file mode 100644 index 169479b..0000000 --- a/sub_crates/text_encoding/src/utf8.rs +++ /dev/null @@ -1,338 +0,0 @@ -//! These functions are essentially redundant, since they're supposedly -//! encoding/decoding between utf8 and... utf8. However, `decode_to_str()` -//! is still useful for validating unknown input. And they allow a uniform -//! API for all encodings. - -use core; -use {DecodeError, DecodeResult, EncodeResult}; - -pub fn encode_from_str<'a>(input: &str, output: &'a mut [u8]) -> EncodeResult<'a> { - let cl = copy_len(input.as_bytes(), output.len()); - output[..cl].copy_from_slice(input[..cl].as_bytes()); - Ok((cl, &output[..cl])) -} - -pub fn decode_to_str<'a>(input: &[u8], output: &'a mut [u8]) -> DecodeResult<'a> { - // Find how much of the data is valid utf8. - let valid_up_to = match core::str::from_utf8(input) { - Ok(text) => text.len(), - Err(e) => e.valid_up_to(), - }; - - // Copy over what we can. - let bytes_copied = copy_len(&input[..valid_up_to], output.len()); - output[..bytes_copied].copy_from_slice(&input[..bytes_copied]); - - // Determine if there's an error. - if bytes_copied < output.len() && bytes_copied == valid_up_to && valid_up_to < input.len() { - let trailing_bytes = input.len() - valid_up_to; - let byte = input[valid_up_to]; - // First we check if we're truncated. If we are, then don't error - // yet, because we want to provide the full byte range of the error. - let is_truncated = ((byte & 0b11100000) == 0b11000000 && trailing_bytes < 2) - || ((byte & 0b11110000) == 0b11100000 && trailing_bytes < 3) - || ((byte & 0b11111000) == 0b11110000 && trailing_bytes < 4); - if !is_truncated { - // Find the byte range of the error by finding the next valid - // starting byte (or reaching end of input). - let mut i = valid_up_to + 1; - while i < input.len() - && ((input[i] & 0b11000000) == 0b10000000 || (input[i] & 0b11111000) == 0b11111000) - { - i += 1; - } - // Return the error. - return Err(DecodeError { - error_range: (valid_up_to, i), - output_bytes_written: bytes_copied, - }); - } - } - - // No error, return success. - Ok((bytes_copied, unsafe { - core::str::from_utf8_unchecked(&output[..bytes_copied]) - })) -} - -/// Calculates how many bytes should be copied from input to output given -/// their lengths and the content of input. Specifically, it calculates -/// the maximum amount that can be copied without incompletely copying -/// any multi-byte codepoints. -/// -/// Input is assumed to be valid and complete utf8 (i.e. could be turned -/// directly into a &str). -#[inline(always)] -fn copy_len(input: &[u8], output_len: usize) -> usize { - if output_len >= input.len() { - input.len() - } else { - let mut i = output_len; - while i > 0 && (input[i] & 0b11000000) == 0b10000000 { - i -= 1; - } - i - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn encode_01() { - let text = "こんにちは!"; - let mut buf = [0u8; 2]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(encoded, &[]); - } - - #[test] - fn encode_02() { - let text = "こんにちは!"; - let mut buf = [0u8; 3]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(encoded, &[0xE3, 0x81, 0x93]); - } - - #[test] - fn encode_03() { - let text = "こんにちは!"; - let mut buf = [0u8; 5]; - let (consumed_count, encoded) = encode_from_str(text, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(encoded, &[0xE3, 0x81, 0x93]); - } - - #[test] - fn decode_01() { - let data = [ - 0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, - 0xAF, 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" - let mut buf = [0u8; 2]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 0); - assert_eq!(decoded, ""); - } - - #[test] - fn decode_02() { - let data = [ - 0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, - 0xAF, 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" - let mut buf = [0u8; 3]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(decoded, "こ"); - } - - #[test] - fn decode_03() { - let data = [ - 0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, - 0xAF, 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" - let mut buf = [0u8; 5]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 3); - assert_eq!(decoded, "こ"); - } - - #[test] - fn decode_04() { - let data = [ - 0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, - 0xAF, 0xEF, 0xBC, - ]; // "こんにちは!" with last byte chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 15); - assert_eq!(decoded, "こんにちは"); - } - - #[test] - fn decode_05() { - let data = [ - 0xE3, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, - 0xAF, 0xEF, - ]; // "こんにちは!" with last 2 bytes chopped off. - let mut buf = [0u8; 64]; - let (consumed_count, decoded) = decode_to_str(&data, &mut buf).unwrap(); - assert_eq!(consumed_count, 15); - assert_eq!(decoded, "こんにちは"); - } - - #[test] - fn decode_error_01() { - let data = [ - 0b10000000, 0x81, 0x93, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, - 0x81, 0xAF, 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" with an error on the first char (continuing code unit). - let mut buf = [0u8; 2]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (0, 3), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_02() { - let data = [ - 0xE3, 0x81, 0xE3, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, - 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" with an error on the first code point (too few continuing code units). - let mut buf = [0u8; 2]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (0, 2), - output_bytes_written: 0, - }) - ); - } - - #[test] - fn decode_error_03() { - let data = [ - 0xE3, 0x81, 0x93, 0b10000000, 0x82, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, - 0x81, 0xAF, 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" with an error on the second code point (continuing code unit). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (3, 6), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_04() { - let data = [ - 0xE3, 0x81, 0x93, 0b10000000, 0x82, 0x93, 0b10000000, 0x81, 0xAB, 0b10000000, 0x81, - 0xA1, 0xE3, 0x81, 0xAF, 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" with an error on the second code point (lots of continuing code units). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (3, 12), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_05() { - let data = [ - 0xE3, 0x81, 0x93, 0b11111000, 0x82, 0x93, 0x93, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, - 0xA1, 0xE3, 0x81, 0xAF, 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" with an error on the second code point (invalid bit pattern). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (3, 8), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_06() { - let data = [ - 0xE3, 0x81, 0x93, 0xED, 0xA0, 0x80, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, - 0xAF, 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" with an error on the second code point (beginning of surrogate range). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (3, 6), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_07() { - let data = [ - 0xE3, 0x81, 0x93, 0xED, 0xBF, 0xBF, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, - 0xAF, 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" with an error on the second code point (end of surrogate range). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (3, 6), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_08() { - let data = [ - 0xE3, 0x81, 0x93, 0xF4, 0x90, 0x80, 0x80, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, - 0x81, 0xAF, 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" with an error on the second code point (out of unicode range). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (3, 7), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_09() { - let data = [ - 0xE3, 0x81, 0x93, 0xC0, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, - 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" with an error on the second code point (byte == 0xC0). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (3, 5), - output_bytes_written: 3, - }) - ); - } - - #[test] - fn decode_error_10() { - let data = [ - 0xE3, 0x81, 0x93, 0xC1, 0x93, 0xE3, 0x81, 0xAB, 0xE3, 0x81, 0xA1, 0xE3, 0x81, 0xAF, - 0xEF, 0xBC, 0x81, - ]; // "こんにちは!" with an error on the second code point (byte == 0xC1). - let mut buf = [0u8; 64]; - let error = decode_to_str(&data, &mut buf); - assert_eq!( - error, - Err(DecodeError { - error_range: (3, 5), - output_bytes_written: 3, - }) - ); - } -} diff --git a/sub_crates/text_encoding/src/utils.rs b/sub_crates/text_encoding/src/utils.rs deleted file mode 100644 index 9c1efde..0000000 --- a/sub_crates/text_encoding/src/utils.rs +++ /dev/null @@ -1,121 +0,0 @@ -use core::mem::transmute; - -#[inline(always)] -pub(crate) fn to_big_endian_u16(n: u16) -> [u8; 2] { - let ptr = unsafe { transmute::<*const u16, *const u8>(&n as *const u16) }; - if cfg!(target_endian = "little") { - unsafe { [*ptr.offset(1), *ptr] } - } else { - unsafe { [*ptr, *ptr.offset(1)] } - } -} - -#[inline(always)] -pub(crate) fn from_big_endian_u16(n: [u8; 2]) -> u16 { - let mut x: u16 = 0; - let ptr = unsafe { transmute::<*mut u16, *mut u8>(&mut x as *mut u16) }; - if cfg!(target_endian = "little") { - unsafe { - *ptr = n[1]; - *ptr.offset(1) = n[0]; - } - } else { - unsafe { - *ptr = n[0]; - *ptr.offset(1) = n[1]; - } - } - x -} - -#[inline(always)] -pub(crate) fn to_little_endian_u16(n: u16) -> [u8; 2] { - let ptr = unsafe { transmute::<*const u16, *const u8>(&n as *const u16) }; - if cfg!(target_endian = "little") { - unsafe { [*ptr, *ptr.offset(1)] } - } else { - unsafe { [*ptr.offset(1), *ptr] } - } -} - -#[inline(always)] -pub(crate) fn from_little_endian_u16(n: [u8; 2]) -> u16 { - let mut x: u16 = 0; - let ptr = unsafe { transmute::<*mut u16, *mut u8>(&mut x as *mut u16) }; - if cfg!(target_endian = "little") { - unsafe { - *ptr = n[0]; - *ptr.offset(1) = n[1]; - } - } else { - unsafe { - *ptr = n[1]; - *ptr.offset(1) = n[0]; - } - } - x -} - -#[inline(always)] -pub(crate) fn to_big_endian_u32(n: u32) -> [u8; 4] { - let ptr = unsafe { transmute::<*const u32, *const u8>(&n as *const u32) }; - if cfg!(target_endian = "little") { - unsafe { [*ptr.offset(3), *ptr.offset(2), *ptr.offset(1), *ptr] } - } else { - unsafe { [*ptr, *ptr.offset(1), *ptr.offset(2), *ptr.offset(3)] } - } -} - -#[inline(always)] -pub(crate) fn from_big_endian_u32(n: [u8; 4]) -> u32 { - let mut x: u32 = 0; - let ptr = unsafe { transmute::<*mut u32, *mut u8>(&mut x as *mut u32) }; - if cfg!(target_endian = "little") { - unsafe { - *ptr = n[3]; - *ptr.offset(1) = n[2]; - *ptr.offset(2) = n[1]; - *ptr.offset(3) = n[0]; - } - } else { - unsafe { - *ptr = n[0]; - *ptr.offset(1) = n[1]; - *ptr.offset(2) = n[2]; - *ptr.offset(3) = n[3]; - } - } - x -} - -#[inline(always)] -pub(crate) fn to_little_endian_u32(n: u32) -> [u8; 4] { - let ptr = unsafe { transmute::<*const u32, *const u8>(&n as *const u32) }; - if cfg!(target_endian = "little") { - unsafe { [*ptr, *ptr.offset(1), *ptr.offset(2), *ptr.offset(3)] } - } else { - unsafe { [*ptr.offset(3), *ptr.offset(2), *ptr.offset(1), *ptr] } - } -} - -#[inline(always)] -pub(crate) fn from_little_endian_u32(n: [u8; 4]) -> u32 { - let mut x: u32 = 0; - let ptr = unsafe { transmute::<*mut u32, *mut u8>(&mut x as *mut u32) }; - if cfg!(target_endian = "little") { - unsafe { - *ptr = n[0]; - *ptr.offset(1) = n[1]; - *ptr.offset(2) = n[2]; - *ptr.offset(3) = n[3]; - } - } else { - unsafe { - *ptr = n[3]; - *ptr.offset(1) = n[2]; - *ptr.offset(2) = n[1]; - *ptr.offset(3) = n[0]; - } - } - x -} diff --git a/sub_crates/text_encoding/tests/property_tests.rs b/sub_crates/text_encoding/tests/property_tests.rs deleted file mode 100644 index bc8171c..0000000 --- a/sub_crates/text_encoding/tests/property_tests.rs +++ /dev/null @@ -1,239 +0,0 @@ -#[macro_use] -extern crate proptest; -extern crate text_encoding; - -use proptest::collection::vec; -use proptest::test_runner::Config; -use text_encoding::{decode_to_str, encode_from_str, Encoding}; - -proptest! { - #![proptest_config(Config::with_cases(512))] - - #[test] - fn pt_utf8_roundtrip(ref text in "\\PC*\\PC*\\PC*") { - let mut buf = [0u8; 32]; - let mut utf8_encoded: Vec = Vec::new(); - let mut utf8 = String::new(); - - // Encode to utf8 - let mut tmp = &text[..]; - while !tmp.is_empty() { - if let Ok((n, encoded)) = encode_from_str(Encoding::Utf8, tmp, &mut buf) { - tmp = &tmp[n..]; - utf8_encoded.extend_from_slice(encoded); - } else { - panic!("Error when encoding."); - } - } - - // Decode back from utf8 - let mut tmp = &utf8_encoded[..]; - while !tmp.is_empty() { - if let Ok((n, decoded)) = decode_to_str(Encoding::Utf8, tmp, &mut buf) { - tmp = &tmp[n..]; - utf8.extend(decoded.chars()); - } else { - panic!("Error when decoding."); - } - } - - assert_eq!(&text[..], &utf8[..]); - assert_eq!(text.as_bytes(), &utf8_encoded[..]); - assert_eq!(utf8.as_bytes(), &utf8_encoded[..]); - } - - #[test] - fn pt_utf16be_roundtrip(ref text in "\\PC*\\PC*\\PC*") { - let mut buf = [0u8; 32]; - let mut utf16: Vec = Vec::new(); - let mut utf8 = String::new(); - - // Encode to utf16 big endian - let mut tmp = &text[..]; - while !tmp.is_empty() { - if let Ok((n, encoded)) = encode_from_str(Encoding::Utf16BE, tmp, &mut buf) { - tmp = &tmp[n..]; - utf16.extend_from_slice(encoded); - } else { - panic!("Error when encoding."); - } - } - - // Decode back from utf16 big endian - let mut tmp = &utf16[..]; - while !tmp.is_empty() { - if let Ok((n, decoded)) = decode_to_str(Encoding::Utf16BE, tmp, &mut buf) { - tmp = &tmp[n..]; - utf8.extend(decoded.chars()); - } else { - panic!("Error when decoding."); - } - } - - assert_eq!(&text[..], &utf8[..]); - } - - #[test] - fn pt_utf16le_roundtrip(ref text in "\\PC*\\PC*\\PC*") { - let mut buf = [0u8; 32]; - let mut utf16: Vec = Vec::new(); - let mut utf8 = String::new(); - - // Encode to utf16 little endian - let mut tmp = &text[..]; - while !tmp.is_empty() { - if let Ok((n, encoded)) = encode_from_str(Encoding::Utf16LE, tmp, &mut buf) { - tmp = &tmp[n..]; - utf16.extend_from_slice(encoded); - } else { - panic!("Error when encoding."); - } - } - - // Decode back from utf16 big endian - let mut tmp = &utf16[..]; - while !tmp.is_empty() { - if let Ok((n, decoded)) = decode_to_str(Encoding::Utf16LE, tmp, &mut buf) { - tmp = &tmp[n..]; - utf8.extend(decoded.chars()); - } else { - panic!("Error when decoding."); - } - } - - assert_eq!(&text[..], &utf8[..]); - } - - #[test] - fn pt_utf32be_roundtrip(ref text in "\\PC*\\PC*\\PC*") { - let mut buf = [0u8; 32]; - let mut utf32: Vec = Vec::new(); - let mut utf8 = String::new(); - - // Encode to utf32 big endian - let mut tmp = &text[..]; - while !tmp.is_empty() { - if let Ok((n, encoded)) = encode_from_str(Encoding::Utf32BE, tmp, &mut buf) { - tmp = &tmp[n..]; - utf32.extend_from_slice(encoded); - } else { - panic!("Error when encoding."); - } - } - - // Decode back from utf32 big endian - let mut tmp = &utf32[..]; - while !tmp.is_empty() { - if let Ok((n, decoded)) = decode_to_str(Encoding::Utf32BE, tmp, &mut buf) { - tmp = &tmp[n..]; - utf8.extend(decoded.chars()); - } else { - panic!("Error when decoding."); - } - } - - assert_eq!(&text[..], &utf8[..]); - } - - #[test] - fn pt_utf32le_roundtrip(ref text in "\\PC*\\PC*\\PC*") { - let mut buf = [0u8; 32]; - let mut utf32: Vec = Vec::new(); - let mut utf8 = String::new(); - - // Encode to utf32 little endian - let mut tmp = &text[..]; - while !tmp.is_empty() { - if let Ok((n, encoded)) = encode_from_str(Encoding::Utf32LE, tmp, &mut buf) { - tmp = &tmp[n..]; - utf32.extend_from_slice(encoded); - } else { - panic!("Error when encoding."); - } - } - - // Decode back from utf32 little endian - let mut tmp = &utf32[..]; - while !tmp.is_empty() { - if let Ok((n, decoded)) = decode_to_str(Encoding::Utf32LE, tmp, &mut buf) { - tmp = &tmp[n..]; - utf8.extend(decoded.chars()); - } else { - panic!("Error when decoding."); - } - } - - assert_eq!(&text[..], &utf8[..]); - } - - #[test] - fn pt_latin1_roundtrip(ref data in vec(0u8..=255, 0..1000)) { - let mut buf = [0u8; 32]; - let mut utf8 = String::new(); - let mut latin1: Vec = Vec::new(); - - // Decode from latin1 to utf8 - let mut tmp = &data[..]; - while !tmp.is_empty() { - if let Ok((n, decoded)) = decode_to_str(Encoding::Latin1, tmp, &mut buf) { - tmp = &tmp[n..]; - utf8.extend(decoded.chars()); - } else { - panic!("Error when decoding."); - } - } - - // Encode to from utf8 back to latin1 - let mut tmp = &utf8[..]; - while !tmp.is_empty() { - if let Ok((n, encoded)) = encode_from_str(Encoding::Latin1, tmp, &mut buf) { - tmp = &tmp[n..]; - latin1.extend_from_slice(encoded); - } else { - panic!("Error when encoding."); - } - } - - assert_eq!(&data[..], &latin1[..]); - } - - // The iso-8859-7 tests are representative of all single-byte encodings - // (except latin1) since they're all generated and share their code. - #[test] - fn pt_iso_8859_7_roundtrip(mut data in vec(0u8..=255, 0..1000)) { - let mut buf = [0u8; 32]; - let mut utf8 = String::new(); - let mut iso8859_7: Vec = Vec::new(); - - // Eliminate undefined bytes in input. - for b in data.iter_mut() { - if *b == 0xAE || *b == 0xD2 || *b == 0xFF { - *b = 0; - } - } - - // Decode from iso-8859-7 to utf8 - let mut tmp = &data[..]; - while !tmp.is_empty() { - if let Ok((n, decoded)) = decode_to_str(Encoding::ISO8859_7, tmp, &mut buf) { - tmp = &tmp[n..]; - utf8.extend(decoded.chars()); - } else { - panic!("Error when decoding."); - } - } - - // Encode to from utf8 back to iso-8859-7 - let mut tmp = &utf8[..]; - while !tmp.is_empty() { - if let Ok((n, encoded)) = encode_from_str(Encoding::ISO8859_7, tmp, &mut buf) { - tmp = &tmp[n..]; - iso8859_7.extend_from_slice(encoded); - } else { - panic!("Error when encoding."); - } - } - - assert_eq!(&data[..], &iso8859_7[..]); - } -}