From cbad731152602971c57dd8c4bf78fd88fbb5ee13 Mon Sep 17 00:00:00 2001 From: Ryan Date: Tue, 24 Dec 2024 21:21:21 +0100 Subject: [PATCH] Hello libcode-unicode --- .editorconfig | 17 +++++ .gitattributes | 1 + .gitea/workflows/on-push.yaml | 24 +++++++ .gitignore | 31 ++++++++ LICENSE | 31 ++++++++ README.md | 21 ++++++ build/.gitignore | 4 ++ build/bootstrap.build | 7 ++ build/export.build | 6 ++ build/root.build | 16 +++++ buildfile | 5 ++ code/unicode/.gitignore | 9 +++ code/unicode/buildfile | 66 +++++++++++++++++ code/unicode/decoding.hxx | 50 +++++++++++++ code/unicode/decoding.ixx | 13 ++++ code/unicode/decoding.test.cxx | 67 +++++++++++++++++ code/unicode/decoding.txx | 70 ++++++++++++++++++ code/unicode/encoding.hxx | 35 +++++++++ code/unicode/encoding.ixx | 40 +++++++++++ code/unicode/encoding.test.cxx | 128 +++++++++++++++++++++++++++++++++ code/unicode/iterator.hxx | 124 ++++++++++++++++++++++++++++++++ code/unicode/iterator.test.cxx | 33 +++++++++ code/unicode/iterator.txx | 21 ++++++ code/unicode/unicode.hxx | 13 ++++ code/unicode/version.hxx.in | 34 +++++++++ manifest | 12 ++++ repositories.manifest | 6 ++ tests/.gitignore | 8 +++ tests/build/.gitignore | 4 ++ tests/build/bootstrap.build | 5 ++ tests/build/root.build | 16 +++++ tests/buildfile | 1 + 32 files changed, 918 insertions(+) create mode 100644 .editorconfig create mode 100644 .gitattributes create mode 100644 .gitea/workflows/on-push.yaml create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 build/.gitignore create mode 100644 build/bootstrap.build create mode 100644 build/export.build create mode 100644 build/root.build create mode 100644 buildfile create mode 100644 code/unicode/.gitignore create mode 100644 code/unicode/buildfile create mode 100644 code/unicode/decoding.hxx create mode 100644 code/unicode/decoding.ixx create mode 100644 code/unicode/decoding.test.cxx create mode 100644 code/unicode/decoding.txx create mode 100644 
code/unicode/encoding.hxx create mode 100644 code/unicode/encoding.ixx create mode 100644 code/unicode/encoding.test.cxx create mode 100644 code/unicode/iterator.hxx create mode 100644 code/unicode/iterator.test.cxx create mode 100644 code/unicode/iterator.txx create mode 100644 code/unicode/unicode.hxx create mode 100644 code/unicode/version.hxx.in create mode 100644 manifest create mode 100644 repositories.manifest create mode 100644 tests/.gitignore create mode 100644 tests/build/.gitignore create mode 100644 tests/build/bootstrap.build create mode 100644 tests/build/root.build create mode 100644 tests/buildfile diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..ec9f3de --- /dev/null +++ b/.editorconfig @@ -0,0 +1,17 @@ +root = true + +[*] +indent_style = space +indent_size = 2 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.md] +indent_size = 4 +max_line_length = off +trim_trailing_whitespace = false + +[*.yaml] +indent_size = 2 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..176a458 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text=auto diff --git a/.gitea/workflows/on-push.yaml b/.gitea/workflows/on-push.yaml new file mode 100644 index 0000000..23a9e11 --- /dev/null +++ b/.gitea/workflows/on-push.yaml @@ -0,0 +1,24 @@ +name: on-push +on: [push] + +jobs: + build-and-test: + runs-on: linux + container: code.helloryan.se/infra/buildenv/cxx-amd64-fedora-40:latest + volumes: + - /build + steps: + - name: Clone repository + uses: actions/checkout@v3 + - name: Authenticate + run: | + git config unset http.https://code.helloryan.se/.extraheader + echo "${{ secrets.NETRC }}" >> ~/.netrc + - name: Initialize + run: | + bpkg create -d /build cc config.cc.coptions="-Wall -Werror" + bdep init -A /build + - name: Build + run: b + - name: Test + run: b test diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c96e1ec --- /dev/null 
+++ b/.gitignore @@ -0,0 +1,31 @@ +.bdep/ + +# Local default options files. +# +.build2/local/ + +# Compiler/linker output. +# +*.d +*.t +*.i +*.i.* +*.ii +*.ii.* +*.o +*.obj +*.gcm +*.pcm +*.ifc +*.so +*.dylib +*.dll +*.a +*.lib +*.exp +*.pdb +*.ilk +*.exe +*.exe.dlls/ +*.exe.manifest +*.pc diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..dfc745b --- /dev/null +++ b/LICENSE @@ -0,0 +1,31 @@ +Copyright © 2024 Ryan. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. All advertising materials mentioning features or use of this software must + display the following acknowledgement: + + This product includes software developed by Ryan, http://helloryan.se/. + +4. Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN +NO EVENT SHALL COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..7749ed4 --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +# libcode-unicode + +![Build status](https://code.helloryan.se/code/libcode-unicode/actions/workflows/on-push.yaml/badge.svg) + +## Requirements + +None, other than a modern C++-compiler. + +## Building + +See the wiki, https://code.helloryan.se/code/wiki/wiki/Build-Instructions, for +build instructions. + +## Contact + +Please report bugs and issues by sending an e-mail to: ryan@helloryan.se. + +## Contributing + +Please send an e-mail to ryan@helloryan.se to request an account and +write-access to the libcode-unicode repository. 
diff --git a/build/.gitignore b/build/.gitignore new file mode 100644 index 0000000..974e01d --- /dev/null +++ b/build/.gitignore @@ -0,0 +1,4 @@ +/config.build +/root/ +/bootstrap/ +build/ diff --git a/build/bootstrap.build b/build/bootstrap.build new file mode 100644 index 0000000..fed0464 --- /dev/null +++ b/build/bootstrap.build @@ -0,0 +1,7 @@ +project = libcode-unicode + +using version +using config +using test +using install +using dist diff --git a/build/export.build b/build/export.build new file mode 100644 index 0000000..5a210b3 --- /dev/null +++ b/build/export.build @@ -0,0 +1,6 @@ +$out_root/ +{ + include code/unicode/ +} + +export $out_root/code/unicode/$import.target diff --git a/build/root.build b/build/root.build new file mode 100644 index 0000000..21e0a2e --- /dev/null +++ b/build/root.build @@ -0,0 +1,16 @@ +# Uncomment to suppress warnings coming from external libraries. +# +#cxx.internal.scope = current + +cxx.std = latest + +using cxx + +hxx{*}: extension = hxx +ixx{*}: extension = ixx +txx{*}: extension = txx +cxx{*}: extension = cxx + +# The test target for cross-testing (running tests under Wine, etc). +# +test.target = $cxx.target diff --git a/buildfile b/buildfile new file mode 100644 index 0000000..bbe185e --- /dev/null +++ b/buildfile @@ -0,0 +1,5 @@ +./: {code/ tests/} doc{README.md} legal{LICENSE} manifest + +# Don't install tests. +# +tests/: install = false diff --git a/code/unicode/.gitignore b/code/unicode/.gitignore new file mode 100644 index 0000000..b1ed0e0 --- /dev/null +++ b/code/unicode/.gitignore @@ -0,0 +1,9 @@ +# Generated version header. +# +version.hxx + +# Unit test executables and Testscript output directories +# (can be symlinks). +# +*.test +test-*.test diff --git a/code/unicode/buildfile b/code/unicode/buildfile new file mode 100644 index 0000000..fcd631c --- /dev/null +++ b/code/unicode/buildfile @@ -0,0 +1,66 @@ +intf_libs = # Interface dependencies. +impl_libs = # Implementation dependencies. 
+ +./: lib{code-unicode}: libul{code-unicode} + +libul{code-unicode}: {hxx ixx txx cxx}{** -**.test... -version} \ + {hxx }{ version} + +libul{code-unicode}: $impl_libs $intf_libs + +# Unit tests. +# +exe{*.test}: +{ + test = true + install = false +} + +test_libs = +import test_libs =+ libcode-validation%lib{code-validation} + +for t: cxx{**.test...} +{ + d = $directory($t) + n = $name($t)... + + ./: $d/exe{$n}: $t $d/{hxx ixx txx}{+$n} $d/testscript{+$n} $test_libs + $d/exe{$n}: libul{code-unicode}: bin.whole = false + $d/exe{$n}: test.arguments = -v -v +} + +hxx{version}: in{version} $src_root/manifest +{ + dist = true + clean = ($src_root != $out_root) +} + +# Build options. +# +cxx.poptions =+ "-I$out_root" "-I$src_root" + +# Export options. +# +lib{code-unicode}: +{ + cxx.export.poptions = "-I$out_root" "-I$src_root" + cxx.export.libs = $intf_libs +} + +# For pre-releases use the complete version to make sure they cannot +# be used in place of another pre-release or the final version. See +# the version module for details on the version.* variable values. +# +if $version.pre_release + lib{code-unicode}: bin.lib.version = "-$version.project_id" +else + lib{code-unicode}: bin.lib.version = "-$version.major.$version.minor" + +# Install into the code/unicode/ subdirectory of, say, /usr/include/ +# recreating subdirectories. 
namespace code::unicode
{

  /// Abstract interface for objects that extract Unicode code points
  /// from a byte-oriented input stream.
  class Decoder
  {
  public:
    /// Virtual so that concrete decoders can be destroyed through a
    /// pointer or reference to this interface.
    virtual ~Decoder() = default;

    /// Extract one code point from the stream.
    ///
    /// @param i Stream to read the encoded bytes from.
    /// @return The extracted code point.
    virtual std::uint32_t extract(std::istream& i) const = 0;
  };

  /// Decoder for UTF-8 encoded input.
  class Utf8_decoder : public Decoder
  {
  public:
    /// Extract one UTF-8 encoded code point from the stream.
    ///
    /// @param i Stream to read the encoded bytes from.
    /// @return The extracted code point.
    std::uint32_t extract(std::istream& i) const override;

    /// Decode one code point from the range [it, end).
    ///
    /// @param it  Current position; advanced past the bytes consumed.
    /// @param end End of the input range.
    /// @return The decoded code point.
    template< typename InputIterator >
    std::uint32_t decode(InputIterator& it, InputIterator const& end) const;

    /// Convenience overload for an iterator passed as a temporary.
    ///
    /// Inside the function the parameter is a named object, so the call
    /// is forwarded to the overload above; the advanced position is not
    /// reported back to the caller.
    ///
    /// @param it  Position to start decoding from.
    /// @param end End of the input range.
    /// @return The decoded code point.
    template< typename InputIterator >
    std::uint32_t decode(InputIterator&& it, InputIterator const& end) const
    {
      return decode(it, end);
    }
  };

} // namespace code::unicode
0x7fU); +} + +VALIDATION_TEST(test_2) +{ + std::string const encoded_0{ "\xc2\x80" }; + std::string const encoded_1{ "\xdf\xbf" }; + + code::unicode::Utf8_decoder utf8; + + auto decoded_0 = utf8.decode(encoded_0.begin(), encoded_0.end()); + auto decoded_1 = utf8.decode(encoded_1.begin(), encoded_1.end()); + + VALIDATION_ASSERT_EQUAL(decoded_0, 0x80U); + VALIDATION_ASSERT_EQUAL(decoded_1, 0x7ffU); +} + +VALIDATION_TEST(test_3) +{ + std::string const encoded_0{ "\xe0\xa0\x80" }; + std::string const encoded_1{ "\xef\xbf\xbf" }; + + code::unicode::Utf8_decoder utf8; + + auto decoded_0 = utf8.decode(encoded_0.begin(), encoded_0.end()); + auto decoded_1 = utf8.decode(encoded_1.begin(), encoded_1.end()); + + VALIDATION_ASSERT_EQUAL(decoded_0, 0x800U); + VALIDATION_ASSERT_EQUAL(decoded_1, 0xffffU); +} + +VALIDATION_TEST(test_4) +{ + std::string const encoded_0{ "\xf0\x90\x80\x80" }; + std::string const encoded_1{ "\xf4\x8f\xbf\xbf" }; + + code::unicode::Utf8_decoder utf8; + + auto decoded_0 = utf8.decode(encoded_0.begin(), encoded_0.end()); + auto decoded_1 = utf8.decode(encoded_1.begin(), encoded_1.end()); + + VALIDATION_ASSERT_EQUAL(decoded_0, 0x10000U); + VALIDATION_ASSERT_EQUAL(decoded_1, 0x10ffffU); +} + +int +main(int argc, char* argv[]) +{ + return code::validation::main(argc, argv); +} diff --git a/code/unicode/decoding.txx b/code/unicode/decoding.txx new file mode 100644 index 0000000..e3fc65b --- /dev/null +++ b/code/unicode/decoding.txx @@ -0,0 +1,70 @@ +namespace code::unicode { + + template + std::uint32_t + Utf8_decoder:: + decode(InputIterator& it, InputIterator const& end) const + { + if (it == end) + return replacement_character; + + std::uint32_t c1 = (unsigned char)*it++; + + // 1 byte (valid: 0 <= codepoint <= 0x7f) + if (c1 <= 0x7f) + return c1; + + if (it == end) + return replacement_character; + + std::uint32_t c2 = (unsigned char)*it++; + + // 2 bytes (valid: 0x80 <= codepoint <= 0x7ff) + if (c1 <= 0b11011111) { + std::uint32_t c = ((c1 & 
0b00011111) << 6) | (c2 & 0b00111111); + + if (c < 0x80) // overlong protection + return replacement_character; + + return c; + } + + if (it == end) + return replacement_character; + + std::uint32_t c3 = (unsigned char)*it++; + + // 3 bytes (valid: 0x800 <= codepoint <= 0xffff) + if (c1 <= 0b11101111) { + std::uint32_t c = + ((c1 & 0b00001111) << 12) | ((c2 & 0b00111111) << 6) | (c3 & 0b00111111); + + if (c < 0x800) // overlong protection + return replacement_character; + + return c; + } + + if (it == end) + return replacement_character; + + std::uint32_t c4 = (unsigned char)*it++; + + // 4 bytes (valid: 0x10000 <= codepoint <= 0x10ffff) + if (c1 <= 0b11110111) { + std::uint32_t c = ((c1 & 0b00000111) << 18) | ((c2 & 0b00111111) << 12) | + ((c3 & 0b00111111) << 6) | (c4 & 0b00111111); + + if (c < 0x10000) // overlong protection + return replacement_character; + + if (0x10ffff < c) // overflow protection + return replacement_character; + + return c; + } + + return replacement_character; + } + +} // namespace code::unicode diff --git a/code/unicode/encoding.hxx b/code/unicode/encoding.hxx new file mode 100644 index 0000000..6a6a68a --- /dev/null +++ b/code/unicode/encoding.hxx @@ -0,0 +1,35 @@ +#ifndef code__unicode__encoding_hxx_ +#define code__unicode__encoding_hxx_ + +#include +#include + +namespace code::unicode +{ + + class Encoder + { + public: + virtual + ~Encoder() = default; + + virtual + void + encode(std::ostream& o, std::uint32_t c) const = 0; + + }; + + class Utf8_encoder + : public Encoder + { + public: + void + encode(std::ostream& o, std::uint32_t c) const override; + + }; + +} // namespace code::unicode + +#include + +#endif diff --git a/code/unicode/encoding.ixx b/code/unicode/encoding.ixx new file mode 100644 index 0000000..940c659 --- /dev/null +++ b/code/unicode/encoding.ixx @@ -0,0 +1,40 @@ +namespace code::unicode +{ + + inline + void + Utf8_encoder:: + encode(std::ostream& o, std::uint32_t c) const + { + // 1 byte + if (c <= 0x7f) { + 
o.put(c); + return; + } + + // 2 bytes + if (c <= 0x7FF) { + o.put(0b11000000 | ((c >> 6) & 0b00011111)); + o.put(0b10000000 | (c & 0b00111111)); + return; + } + + // 3 bytes + if (c <= 0xFFFF) { + o.put(0b11100000 | ((c >> 12) & 0b00001111)); + o.put(0b10000000 | ((c >> 6) & 0b00111111)); + o.put(0b10000000 | (c & 0b00111111)); + return; + } + + // 4 bytes + if (c <= 0x10FFFF) { + o.put(0b11110000 | ((c >> 18) & 0b00000111)); + o.put(0b10000000 | ((c >> 12) & 0b00111111)); + o.put(0b10000000 | ((c >> 6) & 0b00111111)); + o.put(0b10000000 | (c & 0b00111111)); + return; + } + } + +} // namespace code::unicode diff --git a/code/unicode/encoding.test.cxx b/code/unicode/encoding.test.cxx new file mode 100644 index 0000000..068a060 --- /dev/null +++ b/code/unicode/encoding.test.cxx @@ -0,0 +1,128 @@ +#include + +#include + +#include +#include + +VALIDATION_TEST(test_1) +{ + std::stringstream str; + + code::unicode::Utf8_encoder utf8; + utf8.encode(str, 0); + + auto string = str.str(); + + VALIDATION_ASSERT_EQUAL(string.size(), 1U); + VALIDATION_ASSERT_EQUAL(string[0], 0); +} + +VALIDATION_TEST(test_2) +{ + std::stringstream str; + + code::unicode::Utf8_encoder utf8; + utf8.encode(str, 0x7f); + + auto string = str.str(); + + VALIDATION_ASSERT_EQUAL(string.size(), 1U); + VALIDATION_ASSERT_EQUAL((unsigned char)string[0], 0x7f); +} + +VALIDATION_TEST(test_3) +{ + std::stringstream str; + + code::unicode::Utf8_encoder utf8; + utf8.encode(str, 0x80); + + auto string = str.str(); + + VALIDATION_ASSERT_EQUAL(string.size(), 2U); + VALIDATION_ASSERT_EQUAL((unsigned char)string[0], 0xc2); + VALIDATION_ASSERT_EQUAL((unsigned char)string[1], 0x80); +} + +VALIDATION_TEST(test_4) +{ + std::stringstream str; + + code::unicode::Utf8_encoder utf8; + utf8.encode(str, 0x7ff); + + auto string = str.str(); + + VALIDATION_ASSERT_EQUAL(string.size(), 2U); + VALIDATION_ASSERT_EQUAL((unsigned char)string[0], 0xdf); + VALIDATION_ASSERT_EQUAL((unsigned char)string[1], 0xbf); +} + 
+VALIDATION_TEST(test_5) +{ + std::stringstream str; + + code::unicode::Utf8_encoder utf8; + utf8.encode(str, 0x800); + + auto string = str.str(); + + VALIDATION_ASSERT_EQUAL(string.size(), 3U); + VALIDATION_ASSERT_EQUAL((unsigned char)string[0], 0xe0); + VALIDATION_ASSERT_EQUAL((unsigned char)string[1], 0xa0); + VALIDATION_ASSERT_EQUAL((unsigned char)string[2], 0x80); +} + +VALIDATION_TEST(test_6) +{ + std::stringstream str; + + code::unicode::Utf8_encoder utf8; + utf8.encode(str, 0xffff); + + auto string = str.str(); + + VALIDATION_ASSERT_EQUAL(string.size(), 3U); + VALIDATION_ASSERT_EQUAL((unsigned char)string[0], 0xef); + VALIDATION_ASSERT_EQUAL((unsigned char)string[1], 0xbf); + VALIDATION_ASSERT_EQUAL((unsigned char)string[2], 0xbf); +} + +VALIDATION_TEST(test_7) +{ + std::stringstream str; + + code::unicode::Utf8_encoder utf8; + utf8.encode(str, 0x10000); + + auto string = str.str(); + + VALIDATION_ASSERT_EQUAL(string.size(), 4U); + VALIDATION_ASSERT_EQUAL((unsigned char)string[0], 0xf0); + VALIDATION_ASSERT_EQUAL((unsigned char)string[1], 0x90); + VALIDATION_ASSERT_EQUAL((unsigned char)string[2], 0x80); + VALIDATION_ASSERT_EQUAL((unsigned char)string[3], 0x80); +} + +VALIDATION_TEST(test_8) +{ + std::stringstream str; + + code::unicode::Utf8_encoder utf8; + utf8.encode(str, 0x10ffff); + + auto string = str.str(); + + VALIDATION_ASSERT_EQUAL(string.size(), 4U); + VALIDATION_ASSERT_EQUAL((unsigned char)string[0], 0xf4); + VALIDATION_ASSERT_EQUAL((unsigned char)string[1], 0x8f); + VALIDATION_ASSERT_EQUAL((unsigned char)string[2], 0xbf); + VALIDATION_ASSERT_EQUAL((unsigned char)string[3], 0xbf); +} + +int +main(int argc, char* argv[]) +{ + return code::validation::main(argc, argv); +} diff --git a/code/unicode/iterator.hxx b/code/unicode/iterator.hxx new file mode 100644 index 0000000..35108ee --- /dev/null +++ b/code/unicode/iterator.hxx @@ -0,0 +1,124 @@ +#ifndef code__unicode__iterator_hxx_ +#define code__unicode__iterator_hxx_ + +#include + +#include + 
/// Input iterator adaptor that presents a sequence of encoded bytes as a
/// sequence of Unicode code points.
///
/// The adaptor works with one code point of look-ahead: every constructor
/// that receives a position decodes the first code point immediately, and
/// each increment decodes the next one.  The wrapped iterator is therefore
/// always positioned *after* the code point returned by `operator*`.
///
/// Because of that look-ahead the adaptor only compares equal to a
/// position once an increment (or the initial decode) has found no more
/// input.  Comparing the raw look-ahead position instead would report
/// "end" while the last code point is still waiting to be read.
///
/// @tparam Decoder      Type providing
///                      `decode(IteratorType&, IteratorType const&)`.
/// @tparam IteratorType Iterator over the encoded bytes.
template< typename Decoder, typename IteratorType >
class Unicode_input_iterator
{
public:
  using decoder_type = Decoder;
  using iterator_type = IteratorType;
  using value_type = std::uint32_t;

  // TODO standard member types

  /// Construct an exhausted iterator with value-initialised positions.
  Unicode_input_iterator() = default;

  /// Decode from `it` up to a value-initialised `iterator_type`.
  explicit Unicode_input_iterator(iterator_type it) : it_{ std::move(it) }
  {
    decode();
  }

  /// Decode the range [it, end).
  Unicode_input_iterator(iterator_type it, iterator_type end)
    : it_{ std::move(it) }, end_{ std::move(end) }
  {
    decode();
  }

  /// Decode from `it` up to a value-initialised `iterator_type` using
  /// the given decoder.
  Unicode_input_iterator(decoder_type decoder, iterator_type it)
    : decoder_{ std::move(decoder) }, it_{ std::move(it) }
  {
    decode();
  }

  /// Decode the range [it, end) using the given decoder.
  Unicode_input_iterator(decoder_type decoder,
                         iterator_type it,
                         iterator_type end)
    : decoder_{ std::move(decoder) },
      it_{ std::move(it) },
      end_{ std::move(end) }
  {
    decode();
  }

  /// Return the current code point.
  value_type operator*() const
  {
    return unicode_;
  }

  /// Advance to the next code point.
  Unicode_input_iterator&
  operator++()
  {
    decode();
    return *this;
  }

  /// Holds the code point that was current before a post-increment so
  /// that `*it++` keeps working.
  struct Proxy
  {
    value_type unicode_;

    value_type operator*() const
    {
      return unicode_;
    }
  };

  /// Advance to the next code point and return a proxy for the old one.
  Proxy
  operator++(int)
  {
    auto unicode = unicode_;
    decode();
    return Proxy{unicode};
  }

  /// Compare against a position in the underlying sequence.
  ///
  /// @return true only when there is no current code point left and the
  ///         underlying position equals `other`.
  bool
  equal(iterator_type const& other) const
  {
    return exhausted_ && it_ == other;
  }

private:
  /// Decode the next code point, or mark the iterator exhausted when the
  /// underlying range has no more input.
  void
  decode()
  {
    if (it_ != end_) {
      unicode_ = decoder_.decode(it_, end_);
      exhausted_ = false;
    }
    else {
      exhausted_ = true;
    }
  }

  decoder_type decoder_;
  iterator_type it_{};   // look-ahead position (past the current value)
  iterator_type end_{};  // value-initialised unless supplied by the caller
  std::uint32_t unicode_{};
  bool exhausted_{ true }; // true when no current code point is available
};
b/code/unicode/iterator.test.cxx new file mode 100644 index 0000000..cb18110 --- /dev/null +++ b/code/unicode/iterator.test.cxx @@ -0,0 +1,33 @@ +#include + +#include + +#include +#include + +VALIDATION_TEST(test_1) +{ + std::string utf8{"h€ࠀ𐀀"}; + + code::unicode::Utf8_input_iterator it{ + utf8.begin(), utf8.end() + }; + + std::uint32_t c1 = *it++; + std::uint32_t c2 = *it++; + std::uint32_t c3 = *it++; + std::uint32_t c4 = *it++; + + VALIDATION_ASSERT_EQUAL(it, utf8.end()); + + VALIDATION_ASSERT_EQUAL(c1, (std::uint32_t)'h'); + VALIDATION_ASSERT_EQUAL(c2, 0x80U); + VALIDATION_ASSERT_EQUAL(c3, 0x800U); + VALIDATION_ASSERT_EQUAL(c4, 0x10000U); +} + +int +main(int argc, char* argv[]) +{ + return code::validation::main(argc, argv); +} diff --git a/code/unicode/iterator.txx b/code/unicode/iterator.txx new file mode 100644 index 0000000..23e8b66 --- /dev/null +++ b/code/unicode/iterator.txx @@ -0,0 +1,21 @@ +namespace unicode { + +template< typename Decoder, typename IteratorType > +unicode_input_iterator< Decoder, InputIterator >::unicode_input_iterator( + iterator_type&& it, + iterator_type&& end) +{} + +template< typename Decoder, typename IteratorType > +unicode_input_iterator< Decoder, IteratorType >::value_type + unicode_input_iterator< Decoder, IteratorType >::operator*() const +{} + +bool +unicode_input_iterator< Decoder, IteratorType >::equal( + unicode_input_iterator const& other) const +{ + return is_eof() == other.is_eof(); +} + +} // namespace unicode diff --git a/code/unicode/unicode.hxx b/code/unicode/unicode.hxx new file mode 100644 index 0000000..6dbd9f0 --- /dev/null +++ b/code/unicode/unicode.hxx @@ -0,0 +1,13 @@ +#ifndef code__unicode__unicode_hxx_ +#define code__unicode__unicode_hxx_ + +#include + +namespace code::unicode +{ + + std::uint32_t constexpr replacement_character = 0xFFFD; + +} // namespace code::unicode + +#endif diff --git a/code/unicode/version.hxx.in b/code/unicode/version.hxx.in new file mode 100644 index 0000000..d5261dd --- 
/dev/null +++ b/code/unicode/version.hxx.in @@ -0,0 +1,34 @@ +#pragma once + +// The numeric version format is AAAAABBBBBCCCCCDDDE where: +// +// AAAAA - major version number +// BBBBB - minor version number +// CCCCC - bugfix version number +// DDD - alpha / beta (DDD + 500) version number +// E - final (0) / snapshot (1) +// +// When DDDE is not 0, 1 is subtracted from AAAAABBBBBCCCCC. For example: +// +// Version AAAAABBBBBCCCCCDDDE +// +// 0.1.0 0000000001000000000 +// 0.1.2 0000000001000020000 +// 1.2.3 0000100002000030000 +// 2.2.0-a.1 0000200001999990010 +// 3.0.0-b.2 0000299999999995020 +// 2.2.0-a.1.z 0000200001999990011 +// +#define LIBCODE_UNICODE_VERSION $libcode_unicode.version.project_number$ULL +#define LIBCODE_UNICODE_VERSION_STR "$libcode_unicode.version.project$" +#define LIBCODE_UNICODE_VERSION_ID "$libcode_unicode.version.project_id$" +#define LIBCODE_UNICODE_VERSION_FULL "$libcode_unicode.version$" + +#define LIBCODE_UNICODE_VERSION_MAJOR $libcode_unicode.version.major$ +#define LIBCODE_UNICODE_VERSION_MINOR $libcode_unicode.version.minor$ +#define LIBCODE_UNICODE_VERSION_PATCH $libcode_unicode.version.patch$ + +#define LIBCODE_UNICODE_PRE_RELEASE $libcode_unicode.version.pre_release$ + +#define LIBCODE_UNICODE_SNAPSHOT_SN $libcode_unicode.version.snapshot_sn$ULL +#define LIBCODE_UNICODE_SNAPSHOT_ID "$libcode_unicode.version.snapshot_id$" diff --git a/manifest b/manifest new file mode 100644 index 0000000..c0df4f6 --- /dev/null +++ b/manifest @@ -0,0 +1,12 @@ +: 1 +name: libcode-unicode +version: 0.1.0-a.0.z +language: c++ +summary: libcode-unicode C++ library +license: BSD-4-Clause +description-file: README.md +url: https://helloryan.se/code/ +email: ryan@helloryan.se +depends: * build2 >= 0.17.0 +depends: * bpkg >= 0.17.0 +depends: libcode-validation ^0.1.0- diff --git a/repositories.manifest b/repositories.manifest new file mode 100644 index 0000000..d6eca4a --- /dev/null +++ b/repositories.manifest @@ -0,0 +1,6 @@ +: 1 +summary: 
libcode-unicode project repository + +: +role: prerequisite +location: https://code.helloryan.se/code/libcode-validation.git##HEAD diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000..662178d --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,8 @@ +# Test executables. +# +driver + +# Testscript output directories (can be symlinks). +# +test +test-* diff --git a/tests/build/.gitignore b/tests/build/.gitignore new file mode 100644 index 0000000..974e01d --- /dev/null +++ b/tests/build/.gitignore @@ -0,0 +1,4 @@ +/config.build +/root/ +/bootstrap/ +build/ diff --git a/tests/build/bootstrap.build b/tests/build/bootstrap.build new file mode 100644 index 0000000..a07b5ea --- /dev/null +++ b/tests/build/bootstrap.build @@ -0,0 +1,5 @@ +project = # Unnamed tests subproject. + +using config +using test +using dist diff --git a/tests/build/root.build b/tests/build/root.build new file mode 100644 index 0000000..a67b2fe --- /dev/null +++ b/tests/build/root.build @@ -0,0 +1,16 @@ +cxx.std = latest + +using cxx + +hxx{*}: extension = hxx +ixx{*}: extension = ixx +txx{*}: extension = txx +cxx{*}: extension = cxx + +# Every exe{} in this subproject is by default a test. +# +exe{*}: test = true + +# The test target for cross-testing (running tests under Wine, etc). +# +test.target = $cxx.target diff --git a/tests/buildfile b/tests/buildfile new file mode 100644 index 0000000..aeeab15 --- /dev/null +++ b/tests/buildfile @@ -0,0 +1 @@ +./: {*/ -build/}