commit 01a635f3bfa8159865c856912afa719286cfbbd8 Author: Ryan Date: Sat Oct 18 00:37:12 2025 +0200 Hello libart-unicode diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..ec9f3de --- /dev/null +++ b/.editorconfig @@ -0,0 +1,17 @@ +root = true + +[*] +indent_style = space +indent_size = 2 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.md] +indent_size = 4 +max_line_length = off +trim_trailing_whitespace = false + +[*.yaml] +indent_size = 2 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..176a458 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text=auto diff --git a/.gitea/workflows/on-push.yaml b/.gitea/workflows/on-push.yaml new file mode 100644 index 0000000..c4c2eb2 --- /dev/null +++ b/.gitea/workflows/on-push.yaml @@ -0,0 +1,31 @@ +name: on-push + +on: + push: + tags-ignore: + - '*' + branches: + - '**' + +jobs: + build-and-test: + runs-on: linux + container: code.helloryan.se/art/infra/buildenv/x86_64-fedora_42-unified:latest + volumes: + - /build + steps: + - name: Configure repository access + run: | + git config --global http.$GITHUB_SERVER_URL/.extraheader "Authorization: token ${{ secrets.ACT_RUNNER_TOKEN }}" + - name: Configure build directory + run: | + bpkg create -d /build cc config.cxx=clang++ config.cc.coptions="-Wall -Werror -Wno-unknown-pragmas" + - name: Build package + run: | + cd /build + bpkg build --yes --trust-yes $GITHUB_SERVER_URL/$GITHUB_REPOSITORY.git##$GITHUB_SHA + - name: Test package + run: | + cd /build + b test + diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..8055483 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +patreon: helloryan diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c96e1ec --- /dev/null +++ b/.gitignore @@ -0,0 +1,31 @@ +.bdep/ + +# Local default options files. +# +.build2/local/ + +# Compiler/linker output. 
+# +*.d +*.t +*.i +*.i.* +*.ii +*.ii.* +*.o +*.obj +*.gcm +*.pcm +*.ifc +*.so +*.dylib +*.dll +*.a +*.lib +*.exp +*.pdb +*.ilk +*.exe +*.exe.dlls/ +*.exe.manifest +*.pc diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..dfc745b --- /dev/null +++ b/LICENSE @@ -0,0 +1,31 @@ +Copyright © 2024 Ryan. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. All advertising materials mentioning features or use of this software must + display the following acknowledgement: + + This product includes software developed by Ryan, http://helloryan.se/. + +4. Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN +NO EVENT SHALL COPYRIGHT HOLDER BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..41746d5 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +# libart-unicode + +![Build badge](https://code.helloryan.se/art/libart-unicode/actions/workflows/on-push.yaml/badge.svg) + +libart-unicode is a Unicode encoding/decoding library for C++. + +## Sponsorship + +You can sponsor the development of this project via Patreon. Read more +over at https://patreon.com/helloryan. diff --git a/art/unicode/.gitignore b/art/unicode/.gitignore new file mode 100644 index 0000000..b1ed0e0 --- /dev/null +++ b/art/unicode/.gitignore @@ -0,0 +1,9 @@ +# Generated version header. +# +version.hxx + +# Unit test executables and Testscript output directories +# (can be symlinks). +# +*.test +test-*.test diff --git a/art/unicode/buildfile b/art/unicode/buildfile new file mode 100644 index 0000000..9f0adb4 --- /dev/null +++ b/art/unicode/buildfile @@ -0,0 +1,65 @@ +intf_libs = # Interface dependencies. +impl_libs = # Implementation dependencies. + +./: lib{art-unicode}: libul{art-unicode} + +libul{art-unicode}: {hxx ixx txx cxx}{** -**.test... -version} \ + {hxx }{ version} + +libul{art-unicode}: $impl_libs $intf_libs + +# Unit tests. +# +exe{*.test}: +{ + test = true + install = false +} + +test_libs = +import test_libs =+ libart-validation%lib{art-validation} + +for t: cxx{**.test...} +{ + d = $directory($t) + n = $name($t)... + + ./: $d/exe{$n}: $t $d/{hxx ixx txx}{+$n} $d/testscript{+$n} $test_libs + $d/exe{$n}: libul{art-unicode}: bin.whole = false +} + +hxx{version}: in{version} $src_root/manifest +{ + dist = true + clean = ($src_root != $out_root) +} + +# Build options. +# +cxx.poptions =+ "-I$out_root" "-I$src_root" + +# Export options. +# +lib{art-unicode}: +{ + cxx.export.poptions = "-I$out_root" "-I$src_root" + cxx.export.libs = $intf_libs +} + +# For pre-releases use the complete version to make sure they cannot +# be used in place of another pre-release or the final version. 
See +# the version module for details on the version.* variable values. +# +if $version.pre_release + lib{art-unicode}: bin.lib.version = "-$version.project_id" +else + lib{art-unicode}: bin.lib.version = "-$version.major.$version.minor" + +# Install into the art/unicode/ subdirectory of, say, /usr/include/ +# recreating subdirectories. +# +{hxx ixx txx}{*}: +{ + install = include/art/unicode/ + install.subdirs = true +} diff --git a/art/unicode/decoder.cxx b/art/unicode/decoder.cxx new file mode 100644 index 0000000..6bb2921 --- /dev/null +++ b/art/unicode/decoder.cxx @@ -0,0 +1,28 @@ +#include + +namespace art::unicode +{ + + decoder_t:: + ~decoder_t() noexcept + {} + + decoder_t::iterator_t + decoder_t:: + begin() + { + return iterator_t{this}; + } + + decoder_t::iterator_t + decoder_t:: + end() + { + return iterator_t{}; + } + + decoder_t:: + decoder_t() + {} + +} // namespace art::unicode diff --git a/art/unicode/decoder.hxx b/art/unicode/decoder.hxx new file mode 100644 index 0000000..b2358a2 --- /dev/null +++ b/art/unicode/decoder.hxx @@ -0,0 +1,160 @@ +#ifndef art__unicode__decoder_hxx_ +#define art__unicode__decoder_hxx_ + +#include +#include +#include + +namespace art::unicode +{ + + constexpr uint32_t const replacement_character{0xFFFD}; + + class decoder_t + { + public: + virtual + ~decoder_t() noexcept; + + class iterator_t; + friend iterator_t; + + iterator_t + begin(); + + iterator_t + end(); + + virtual + uint32_t + decode() = 0; + + protected: + /// Constructor. + /// + decoder_t(); + + /// Copy-construction is prohibited. + /// + decoder_t(decoder_t const&) = delete; + + /// Move-construction is prohibited. + /// + decoder_t(decoder_t&&) = delete; + + virtual + bool + at_end() = 0; + + /// Copy-assignment is prohibited. + /// + decoder_t& + operator=(decoder_t const&) = delete; + + /// Move-assignment is prohibited. + /// + decoder_t& + operator=(decoder_t&&) = delete; + + }; + + /// Decoder iterator. 
+  ///
+  /// Single-pass input iterator over the code points produced by a
+  /// decoder_t. A default-constructed iterator is the end iterator; an
+  /// iterator becomes equal to it once its decoder reports at_end().
+  ///
+  class decoder_t::iterator_t
+  {
+  public:
+    using value_type = uint32_t;
+    using pointer = uint32_t const*;
+    using reference = uint32_t const&;
+    using difference_type = std::ptrdiff_t;
+    using iterator_category = std::input_iterator_tag;
+
+    /// Constructor.
+    ///
+    iterator_t() = default;
+
+    /// Constructor.
+    ///
+    iterator_t(iterator_t const&) = default;
+
+    /// Constructor.
+    ///
+    iterator_t(iterator_t&&) = default;
+
+    /// Destructor.
+    ///
+    ~iterator_t() noexcept = default;
+
+    /// Assignment.
+    ///
+    iterator_t&
+    operator=(iterator_t const&) = default;
+
+    /// Assignment.
+    ///
+    iterator_t&
+    operator=(iterator_t&&) = default;
+
+    /// Return the most recently decoded code point.
+    ///
+    reference
+    operator*() const
+    {
+      return _codepoint;
+    }
+
+    /// Access the most recently decoded code point.
+    ///
+    pointer
+    operator->() const
+    {
+      return &_codepoint;
+    }
+
+    /// Pre-increment: decode the next code point, or become the end
+    /// iterator if the decoder is exhausted.
+    ///
+    iterator_t&
+    operator++()
+    {
+      next();
+      return *this;
+    }
+
+    /// Post-increment: advance and return a copy that still holds the
+    /// previous code point.
+    ///
+    iterator_t
+    operator++(int)
+    {
+      iterator_t previous{*this};
+      next();
+      return previous;
+    }
+
+    /// Compare equality. Iterators are equal when they refer to the same
+    /// decoder; end iterators refer to no decoder.
+    ///
+    bool
+    operator==(iterator_t const& other) const
+    {
+      return _decoder == other._decoder;
+    }
+
+    /// Compare inequality.
+    ///
+    bool
+    operator!=(iterator_t const& other) const
+    {
+      return !(*this == other);
+    }
+
+  private:
+    friend decoder_t;
+
+    /// Construct an iterator on a decoder and decode the first code
+    /// point.
+    ///
+    iterator_t(decoder_t* d)
+      : _decoder{d}
+    {
+      next();
+    }
+
+    /// Decode the next code point or turn into the end iterator.
+    ///
+    void
+    next()
+    {
+      if (_decoder->at_end()) {
+        _decoder = nullptr;
+      }
+      else {
+        _codepoint = _decoder->decode();
+      }
+    }
+
+    decoder_t* _decoder{};
+
+    // Value-initialized so that copying an end iterator never reads an
+    // indeterminate value.
+    //
+    uint32_t _codepoint{};
+
+  };
+
+} // namespace art::unicode
+
+#endif
diff --git a/art/unicode/encoder.cxx b/art/unicode/encoder.cxx
new file mode 100644
index 0000000..79abd03
--- /dev/null
+++ b/art/unicode/encoder.cxx
@@ -0,0 +1,10 @@
+#include
+
+namespace art::unicode
+{
+
+  encoder_t::
+  encoder_t()
+  {}
+
+} // namespace art::unicode
diff --git a/art/unicode/encoder.hxx b/art/unicode/encoder.hxx
new file mode 100644
index 0000000..957eb1a
--- /dev/null
+++ b/art/unicode/encoder.hxx
@@ -0,0 +1,45 @@
+#ifndef art__unicode__encoder_hxx_
+#define art__unicode__encoder_hxx_
+
+#include
+
+namespace art::unicode
+{
+
+  /// Abstract encoder.
+  ///
+  class encoder_t
+  {
+  public:
+    /// Destructor.
+    ///
+    /// Virtual so that a concrete encoder can be destroyed through an
+    /// encoder_t pointer, as with decoder_t, reader_t and writer_t.
+    ///
+    virtual
+    ~encoder_t() noexcept = default;
+
+    /// Encode a single code point.
+    ///
+    virtual
+    void
+    encode(std::uint32_t) = 0;
+
+  protected:
+    /// Constructor.
+    ///
+    encoder_t();
+
+    /// Copy-construction is prohibited.
+    ///
+    encoder_t(encoder_t const&) = delete;
+
+    /// Move-construction is prohibited.
+    ///
+    encoder_t(encoder_t&&) = delete;
+
+    /// Copy-assignment is prohibited.
+    ///
+    encoder_t&
+    operator=(encoder_t const&) = delete;
+
+    /// Move-assignment is prohibited.
+    ///
+    encoder_t&
+    operator=(encoder_t&&) = delete;
+
+  };
+
+} // namespace art::unicode
+
+#endif
diff --git a/art/unicode/reader.cxx b/art/unicode/reader.cxx
new file mode 100644
index 0000000..0563caa
--- /dev/null
+++ b/art/unicode/reader.cxx
@@ -0,0 +1,23 @@
+#include
+
+namespace art::unicode
+{
+
+  reader_t::
+  ~reader_t() noexcept
+  {}
+
+  std::uint8_t
+  reader_t::
+  advance()
+  {
+    auto byte = get();
+    next();
+    return byte;
+  }
+
+  reader_t::
+  reader_t()
+  {}
+
+} // namespace art::unicode
diff --git a/art/unicode/reader.hxx b/art/unicode/reader.hxx
new file mode 100644
index 0000000..7b452d4
--- /dev/null
+++ b/art/unicode/reader.hxx
@@ -0,0 +1,107 @@
+#ifndef art__unicode__reader_hxx_
+#define art__unicode__reader_hxx_
+
+#include
+#include
+
+namespace art::unicode
+{
+
+  /// Abstract reader.
+  ///
+  class reader_t
+  {
+  public:
+    /// Destructor.
+    ///
+    virtual
+    ~reader_t() noexcept;
+
+    std::uint8_t
+    advance();
+
+    virtual
+    bool
+    at_end() const = 0;
+
+    virtual
+    uint8_t
+    get() const = 0;
+
+    virtual
+    void
+    next() = 0;
+
+  protected:
+    /// Constructor.
+    ///
+    reader_t();
+
+    /// Copy-construction is prohibited.
+    ///
+    reader_t(reader_t const&) = delete;
+
+    /// Move-construction is prohibited.
+    ///
+    reader_t(reader_t&&) = delete;
+
+    /// Copy-assignment is prohibited.
+    ///
+    reader_t&
+    operator=(reader_t const&) = delete;
+
+    /// Move-assignment is prohibited.
+    ///
+    reader_t&
+    operator=(reader_t&&) = delete;
+
+  };
+
+  template
+  class iterator_reader_t
+    : public reader_t
+  {
+  public:
+    using input_iterator = I;
+    using end_iterator = E;
+
+    iterator_reader_t(input_iterator input, end_iterator end)
+      : _current{input},
+        _end{end}
+    {}
+
+    bool
+    at_end() const override
+    {
+      return _current == _end;
+    }
+
+    uint8_t
+    get() const override
+    {
+      if (at_end()) {
+        throw std::logic_error{"iterator at end"};
+      }
+
+      return *_current;
+    }
+
+    void
+    next() override
+    {
+      if (at_end()) {
+        throw std::logic_error{"iterator at end"};
+      }
+
+      ++_current;
+    }
+
+  private:
+    input_iterator _current;
+    end_iterator const _end;
+
+  };
+
+} // namespace art::unicode
+
+#endif
diff --git a/art/unicode/reader.test.cxx b/art/unicode/reader.test.cxx
new file mode 100644
index 0000000..8db8ea9
--- /dev/null
+++ b/art/unicode/reader.test.cxx
@@ -0,0 +1,42 @@
+#include
+
+#include
+
+#include
+
+VALIDATION_TEST(test_XYZ)
+{
+  std::string str{"XYZ"};
+
+  art::unicode::iterator_reader_t reader{
+    str.begin(),
+    str.end()
+  };
+
+  VALIDATION_ASSERT_FALSE(reader.at_end());
+
+  auto b1 = reader.get();
+  reader.next();
+
+  VALIDATION_ASSERT_FALSE(reader.at_end());
+
+  auto b2 = reader.get();
+  reader.next();
+
+  VALIDATION_ASSERT_FALSE(reader.at_end());
+
+  auto b3 = reader.get();
+  reader.next();
+
+  VALIDATION_ASSERT_TRUE(reader.at_end());
+
+  VALIDATION_ASSERT_EQUAL(b1, 'X');
+  VALIDATION_ASSERT_EQUAL(b2, 'Y');
+  VALIDATION_ASSERT_EQUAL(b3, 'Z');
+}
+
+int
+main(int argc, char* argv[])
+{
+  return art::validation::main(argc, argv);
+}
diff --git a/art/unicode/utf8-decoder.cxx b/art/unicode/utf8-decoder.cxx
new file mode 100644
index 0000000..25c04d3
--- /dev/null
+++ b/art/unicode/utf8-decoder.cxx
@@ -0,0 +1,114 @@
+#include
+
+namespace art::unicode
+{
+
+  utf8_decoder_t::
+  utf8_decoder_t(reader_t& reader)
+    : _reader{reader}
+  {}
+
+  /// Decode the next code point from the reader.
+  ///
+  /// Returns replacement_character for ill-formed input: a stray
+  /// continuation byte, an invalid lead byte, a missing or invalid
+  /// continuation byte, an overlong form, a UTF-16 surrogate or a value
+  /// above U+10FFFF. A byte that is not a continuation byte is left in
+  /// the reader so that the next call starts decoding from it.
+  ///
+  uint32_t
+  utf8_decoder_t::
+  decode()
+  {
+    if (at_end()) {
+      return replacement_character;
+    }
+
+    std::uint32_t const lead = _reader.advance();
+
+    // 1 byte (valid: 0 <= value <= 0x7f)
+    //
+    if (lead <= 0x7f) {
+      return lead;
+    }
+
+    std::uint32_t c{};        // Code point being assembled.
+    std::uint32_t minimum{};  // Smallest value that needs this length.
+    unsigned trailing{};      // Continuation bytes still expected.
+
+    if ((lead & 0b11100000) == 0b11000000) {
+      // 2 bytes (valid: 0x80 <= value <= 0x7ff)
+      //
+      c = lead & 0b00011111;
+      minimum = 0x80;
+      trailing = 1;
+    }
+    else if ((lead & 0b11110000) == 0b11100000) {
+      // 3 bytes (valid: 0x800 <= value <= 0xffff)
+      //
+      c = lead & 0b00001111;
+      minimum = 0x800;
+      trailing = 2;
+    }
+    else if ((lead & 0b11111000) == 0b11110000) {
+      // 4 bytes (valid: 0x10000 <= value <= 0x10ffff)
+      //
+      c = lead & 0b00000111;
+      minimum = 0x10000;
+      trailing = 3;
+    }
+    else {
+      // Stray continuation byte (0x80-0xbf) or invalid lead byte
+      // (0xf8-0xff).
+      //
+      return replacement_character;
+    }
+
+    for (; trailing != 0; --trailing) {
+      // Truncated sequence.
+      //
+      if (at_end()) {
+        return replacement_character;
+      }
+
+      std::uint32_t const byte = _reader.get();
+
+      // Not a continuation byte (10xxxxxx); do not consume it.
+      //
+      if ((byte & 0b11000000) != 0b10000000) {
+        return replacement_character;
+      }
+
+      _reader.next();
+      c = (c << 6) | (byte & 0b00111111);
+    }
+
+    // Overlong protection.
+    //
+    if (c < minimum) {
+      return replacement_character;
+    }
+
+    // Surrogate protection (U+D800..U+DFFF are not scalar values).
+    //
+    if (0xd800 <= c && c <= 0xdfff) {
+      return replacement_character;
+    }
+
+    // Overflow protection.
+    //
+    if (0x10ffff < c) {
+      return replacement_character;
+    }
+
+    return c;
+  }
+
+  bool
+  utf8_decoder_t::
+  at_end()
+  {
+    return _reader.at_end();
+  }
+
+} // namespace art::unicode
diff --git a/art/unicode/utf8-decoder.hxx b/art/unicode/utf8-decoder.hxx
new file mode 100644
index 0000000..810b119
--- /dev/null
+++ b/art/unicode/utf8-decoder.hxx
@@ -0,0 +1,53 @@
+#ifndef art__unicode__utf8_decoder_hxx_
+#define art__unicode__utf8_decoder_hxx_
+
+#include
+#include
+
+namespace art::unicode
+{
+
+  /// UTF-8 decoder.
+  ///
+  class utf8_decoder_t
+    : public decoder_t
+  {
+  public:
+    /// Constructor.
+ /// + explicit + utf8_decoder_t(reader_t&); + + /// Copy-construction is prohibited. + /// + utf8_decoder_t(utf8_decoder_t const&) = delete; + + /// Move-construction is prohibited. + /// + utf8_decoder_t(utf8_decoder_t&&) = delete; + + uint32_t + decode() override; + + /// Copy-assignment is prohibited. + /// + utf8_decoder_t& + operator=(utf8_decoder_t const&) = delete; + + /// Move-assignment is prohibited. + /// + utf8_decoder_t& + operator=(utf8_decoder_t&&) = delete; + + protected: + bool + at_end() override; + + private: + reader_t& _reader; + + }; + +} // namespace art::unicode + +#endif diff --git a/art/unicode/utf8-decoder.test.cxx b/art/unicode/utf8-decoder.test.cxx new file mode 100644 index 0000000..4aa0d9f --- /dev/null +++ b/art/unicode/utf8-decoder.test.cxx @@ -0,0 +1,79 @@ +#include +#include + +#include + +#include + +using namespace art::unicode; + +static +uint32_t +decode_one(std::string const& str) +{ + iterator_reader_t reader{str.begin(), str.end()}; + utf8_decoder_t decoder{reader}; + + return decoder.decode(); +} + +VALIDATION_TEST(test_1) +{ + std::string const encoded_0{ "\x00", 1 }; + std::string const encoded_1{ "\x7f", 1 }; + + auto decoded_0 = decode_one(encoded_0); + auto decoded_1 = decode_one(encoded_1); + + VALIDATION_ASSERT_EQUAL(decoded_0, 0U); + VALIDATION_ASSERT_EQUAL(decoded_1, 0x7fU); +} + +VALIDATION_TEST(test_2) +{ + std::string const encoded_0{ "\xc2\x80" }; + std::string const encoded_1{ "\xdf\xbf" }; + + auto decoded_0 = decode_one(encoded_0); + auto decoded_1 = decode_one(encoded_1); + + VALIDATION_ASSERT_EQUAL(decoded_0, 0x80U); + VALIDATION_ASSERT_EQUAL(decoded_1, 0x7ffU); +} + +VALIDATION_TEST(test_3) +{ + std::string const encoded_0{ "\xe0\xa0\x80" }; + std::string const encoded_1{ "\xef\xbf\xbf" }; + + auto decoded_0 = decode_one(encoded_0); + auto decoded_1 = decode_one(encoded_1); + + VALIDATION_ASSERT_EQUAL(decoded_0, 0x800U); + VALIDATION_ASSERT_EQUAL(decoded_1, 0xffffU); +} + +VALIDATION_TEST(test_4) +{ + 
std::string const encoded_0{ "\xf0\x90\x80\x80" };
+  std::string const encoded_1{ "\xf4\x8f\xbf\xbf" };
+
+  auto decoded_0 = decode_one(encoded_0);
+  auto decoded_1 = decode_one(encoded_1);
+
+  VALIDATION_ASSERT_EQUAL(decoded_0, 0x10000U);
+  VALIDATION_ASSERT_EQUAL(decoded_1, 0x10ffffU);
+}
+
+VALIDATION_TEST(overlong)
+{
+  std::string const encoded{"\xc0\xaf"};
+  auto decoded = decode_one(encoded);
+  VALIDATION_ASSERT_EQUAL(decoded, 0xFFFDUL);
+}
+
+int
+main(int argc, char* argv[])
+{
+  return art::validation::main(argc, argv);
+}
diff --git a/art/unicode/utf8-encoder.cxx b/art/unicode/utf8-encoder.cxx
new file mode 100644
index 0000000..2d1ae24
--- /dev/null
+++ b/art/unicode/utf8-encoder.cxx
@@ -0,0 +1,68 @@
+#include
+
+#include <stdexcept>
+
+namespace art::unicode
+{
+
+  utf8_encoder_t::
+  utf8_encoder_t(writer_t& writer)
+    : _writer{writer}
+  {}
+
+  /// Encode a Unicode scalar value as UTF-8.
+  ///
+  /// Throws std::invalid_argument if the code point is a UTF-16
+  /// surrogate (U+D800..U+DFFF) or greater than U+10FFFF; nothing is
+  /// written in that case.
+  ///
+  void
+  utf8_encoder_t::
+  encode(uint32_t codepoint)
+  {
+    // 1 byte.
+    //
+    if (codepoint <= 0x7f) {
+      _writer.write(codepoint);
+      return;
+    }
+
+    // 2 bytes.
+    //
+    if (codepoint <= 0x7FF) {
+      _writer.write(0b11000000 | ((codepoint >> 6) & 0b00011111));
+      _writer.write(0b10000000 | (codepoint & 0b00111111));
+      return;
+    }
+
+    // Surrogates are not Unicode scalar values and have no UTF-8 form.
+    //
+    if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
+      throw std::invalid_argument{"surrogate code point"};
+    }
+
+    // 3 bytes.
+    //
+    if (codepoint <= 0xFFFF) {
+      _writer.write(0b11100000 | ((codepoint >> 12) & 0b00001111));
+      _writer.write(0b10000000 | ((codepoint >> 6) & 0b00111111));
+      _writer.write(0b10000000 | (codepoint & 0b00111111));
+      return;
+    }
+
+    // 4 bytes.
+    //
+    if (codepoint <= 0x10FFFF) {
+      _writer.write(0b11110000 | ((codepoint >> 18) & 0b00000111));
+      _writer.write(0b10000000 | ((codepoint >> 12) & 0b00111111));
+      _writer.write(0b10000000 | ((codepoint >> 6) & 0b00111111));
+      _writer.write(0b10000000 | (codepoint & 0b00111111));
+      return;
+    }
+
+    // Beyond the Unicode code space.
+    //
+    throw std::invalid_argument{"code point out of range"};
+  }
+
+} // namespace art::unicode
diff --git a/art/unicode/utf8-encoder.hxx b/art/unicode/utf8-encoder.hxx
new file mode 100644
index 0000000..130a34f
--- /dev/null
+++ b/art/unicode/utf8-encoder.hxx
@@ -0,0 +1,49 @@
+#ifndef art__unicode__utf8_encoder_hxx_
+#define art__unicode__utf8_encoder_hxx_
+
+#include
+#include
+
+namespace art::unicode
+{
+
+  class utf8_encoder_t
+    : public encoder_t
+  {
+  public:
+    /// Constructor.
+    ///
+    explicit
+    utf8_encoder_t(writer_t&);
+
+    /// Copy-construction is prohibited.
+    ///
+    utf8_encoder_t(utf8_encoder_t const&) = delete;
+
+    /// Move-construction is prohibited.
+    ///
+    utf8_encoder_t(utf8_encoder_t&&) = delete;
+
+    /// Encode Unicode code point.
+    ///
+    void
+    encode(uint32_t) override;
+
+    /// Copy-assignment is prohibited.
+    ///
+    utf8_encoder_t&
+    operator=(utf8_encoder_t const&) = delete;
+
+    /// Move-assignment is prohibited.
+    ///
+    utf8_encoder_t&
+    operator=(utf8_encoder_t&&) = delete;
+
+  private:
+    writer_t& _writer;
+
+  };
+
+} // namespace art::unicode
+
+#endif
diff --git a/art/unicode/utf8-encoder.test.cxx b/art/unicode/utf8-encoder.test.cxx
new file mode 100644
index 0000000..e3406fc
--- /dev/null
+++ b/art/unicode/utf8-encoder.test.cxx
@@ -0,0 +1,121 @@
+#include
+#include
+
+#include
+
+#include
+
+
+VALIDATION_TEST(test_1)
+{
+  std::string str;
+
+  art::unicode::string_writer_t writer{str};
+  art::unicode::utf8_encoder_t encoder{writer};
+  encoder.encode(0);
+
+  VALIDATION_ASSERT_EQUAL(str.size(), 1U);
+  VALIDATION_ASSERT_EQUAL(str[0], 0);
+}
+
+VALIDATION_TEST(test_2)
+{
+  std::string str;
+
+  art::unicode::string_writer_t writer{str};
+  art::unicode::utf8_encoder_t encoder{writer};
+  encoder.encode(0x7f);
+
+  VALIDATION_ASSERT_EQUAL(str.size(), 1U);
+  VALIDATION_ASSERT_EQUAL((unsigned char)str[0], 0x7f);
+}
+
+VALIDATION_TEST(test_3)
+{
+  std::string str;
+
+  art::unicode::string_writer_t writer{str};
+  art::unicode::utf8_encoder_t encoder{writer};
+  encoder.encode(0x80);
+
+
VALIDATION_ASSERT_EQUAL(str.size(), 2U); + VALIDATION_ASSERT_EQUAL((unsigned char)str[0], 0xc2); + VALIDATION_ASSERT_EQUAL((unsigned char)str[1], 0x80); +} + +VALIDATION_TEST(test_4) +{ + std::string str; + + art::unicode::string_writer_t writer{str}; + art::unicode::utf8_encoder_t encoder{writer}; + encoder.encode(0x7ff); + + VALIDATION_ASSERT_EQUAL(str.size(), 2U); + VALIDATION_ASSERT_EQUAL((unsigned char)str[0], 0xdf); + VALIDATION_ASSERT_EQUAL((unsigned char)str[1], 0xbf); +} + +VALIDATION_TEST(test_5) +{ + std::string str; + + art::unicode::string_writer_t writer{str}; + art::unicode::utf8_encoder_t encoder{writer}; + encoder.encode(0x800); + + VALIDATION_ASSERT_EQUAL(str.size(), 3U); + VALIDATION_ASSERT_EQUAL((unsigned char)str[0], 0xe0); + VALIDATION_ASSERT_EQUAL((unsigned char)str[1], 0xa0); + VALIDATION_ASSERT_EQUAL((unsigned char)str[2], 0x80); +} + +VALIDATION_TEST(test_6) +{ + std::string str; + + art::unicode::string_writer_t writer{str}; + art::unicode::utf8_encoder_t encoder{writer}; + encoder.encode(0xffff); + + VALIDATION_ASSERT_EQUAL(str.size(), 3U); + VALIDATION_ASSERT_EQUAL((unsigned char)str[0], 0xef); + VALIDATION_ASSERT_EQUAL((unsigned char)str[1], 0xbf); + VALIDATION_ASSERT_EQUAL((unsigned char)str[2], 0xbf); +} + +VALIDATION_TEST(test_7) +{ + std::string str; + + art::unicode::string_writer_t writer{str}; + art::unicode::utf8_encoder_t encoder{writer}; + encoder.encode(0x10000); + + VALIDATION_ASSERT_EQUAL(str.size(), 4U); + VALIDATION_ASSERT_EQUAL((unsigned char)str[0], 0xf0); + VALIDATION_ASSERT_EQUAL((unsigned char)str[1], 0x90); + VALIDATION_ASSERT_EQUAL((unsigned char)str[2], 0x80); + VALIDATION_ASSERT_EQUAL((unsigned char)str[3], 0x80); +} + +VALIDATION_TEST(test_8) +{ + std::string str; + + art::unicode::string_writer_t writer{str}; + art::unicode::utf8_encoder_t encoder{writer}; + encoder.encode(0x10ffff); + + VALIDATION_ASSERT_EQUAL(str.size(), 4U); + VALIDATION_ASSERT_EQUAL((unsigned char)str[0], 0xf4); + 
VALIDATION_ASSERT_EQUAL((unsigned char)str[1], 0x8f); + VALIDATION_ASSERT_EQUAL((unsigned char)str[2], 0xbf); + VALIDATION_ASSERT_EQUAL((unsigned char)str[3], 0xbf); +} + +int +main(int argc, char* argv[]) +{ + return art::validation::main(argc, argv); +} diff --git a/art/unicode/version.hxx.in b/art/unicode/version.hxx.in new file mode 100644 index 0000000..b5400b9 --- /dev/null +++ b/art/unicode/version.hxx.in @@ -0,0 +1,34 @@ +#pragma once + +// The numeric version format is AAAAABBBBBCCCCCDDDE where: +// +// AAAAA - major version number +// BBBBB - minor version number +// CCCCC - bugfix version number +// DDD - alpha / beta (DDD + 500) version number +// E - final (0) / snapshot (1) +// +// When DDDE is not 0, 1 is subtracted from AAAAABBBBBCCCCC. For example: +// +// Version AAAAABBBBBCCCCCDDDE +// +// 0.1.0 0000000001000000000 +// 0.1.2 0000000001000020000 +// 1.2.3 0000100002000030000 +// 2.2.0-a.1 0000200001999990010 +// 3.0.0-b.2 0000299999999995020 +// 2.2.0-a.1.z 0000200001999990011 +// +#define LIBART_UNICODE_VERSION $libart_unicode.version.project_number$ULL +#define LIBART_UNICODE_VERSION_STR "$libart_unicode.version.project$" +#define LIBART_UNICODE_VERSION_ID "$libart_unicode.version.project_id$" +#define LIBART_UNICODE_VERSION_FULL "$libart_unicode.version$" + +#define LIBART_UNICODE_VERSION_MAJOR $libart_unicode.version.major$ +#define LIBART_UNICODE_VERSION_MINOR $libart_unicode.version.minor$ +#define LIBART_UNICODE_VERSION_PATCH $libart_unicode.version.patch$ + +#define LIBART_UNICODE_PRE_RELEASE $libart_unicode.version.pre_release$ + +#define LIBART_UNICODE_SNAPSHOT_SN $libart_unicode.version.snapshot_sn$ULL +#define LIBART_UNICODE_SNAPSHOT_ID "$libart_unicode.version.snapshot_id$" diff --git a/art/unicode/writer.hxx b/art/unicode/writer.hxx new file mode 100644 index 0000000..c31719b --- /dev/null +++ b/art/unicode/writer.hxx @@ -0,0 +1,97 @@ +#ifndef art__unicode__writer_hxx_ +#define art__unicode__writer_hxx_ + +#include +#include + 
+namespace art::unicode +{ + + /// Abstract writer. + /// + class writer_t + { + public: + /// Destructor. + /// + virtual + ~writer_t() noexcept = default; + + /// Write byte. + /// + virtual + void + write(std::uint8_t) = 0; + + protected: + /// Constructor. + /// + writer_t() = default; + + /// Copy-construction is prohibited. + /// + writer_t(writer_t const&) = delete; + + /// Move-construction is prohibited. + /// + writer_t(writer_t&&) = delete; + + /// Copy-assignment is prohibited. + /// + writer_t& + operator=(writer_t const&) = delete; + + /// Move-assignment is prohibited. + /// + writer_t& + operator=(writer_t&&) = delete; + + }; + + /// String writer. + /// + class string_writer_t + : public writer_t + { + public: + /// Constructor. + /// + /// \param str Reference to the output string. + /// + explicit + string_writer_t(std::string& str) + : _str{str} + {} + + /// Copy-construction is prohibited. + /// + string_writer_t(string_writer_t const&) = delete; + + /// Move-construction is prohibited. + /// + string_writer_t(string_writer_t&&) = delete; + + void + write(uint8_t c) override + { + _str += static_cast(c); + } + + /// Copy-assignment is prohibited. + /// + string_writer_t& + operator=(string_writer_t const&) = delete; + + /// Move-assignment is prohibited. 
+ /// + string_writer_t& + operator=(string_writer_t&&) = delete; + + private: + std::string& _str; + + }; + +} // namespace art::unicode + +#endif diff --git a/art/unicode/writer.test.cxx b/art/unicode/writer.test.cxx new file mode 100644 index 0000000..71aafe1 --- /dev/null +++ b/art/unicode/writer.test.cxx @@ -0,0 +1,22 @@ +#include + +#include + +#include + +VALIDATION_TEST(test_XYZ) +{ + std::string str; + art::unicode::string_writer_t writer{str}; + writer.write('X'); + writer.write('Y'); + writer.write('Z'); + + VALIDATION_ASSERT_EQUAL(str, "XYZ"); +} + +int +main(int argc, char* argv[]) +{ + return art::validation::main(argc, argv); +} diff --git a/build/.gitignore b/build/.gitignore new file mode 100644 index 0000000..974e01d --- /dev/null +++ b/build/.gitignore @@ -0,0 +1,4 @@ +/config.build +/root/ +/bootstrap/ +build/ diff --git a/build/bootstrap.build b/build/bootstrap.build new file mode 100644 index 0000000..f967ec5 --- /dev/null +++ b/build/bootstrap.build @@ -0,0 +1,7 @@ +project = libart-unicode + +using version +using config +using test +using install +using dist diff --git a/build/export.build b/build/export.build new file mode 100644 index 0000000..248d695 --- /dev/null +++ b/build/export.build @@ -0,0 +1,6 @@ +$out_root/ +{ + include art/unicode/ +} + +export $out_root/art/unicode/$import.target diff --git a/build/root.build b/build/root.build new file mode 100644 index 0000000..21e0a2e --- /dev/null +++ b/build/root.build @@ -0,0 +1,16 @@ +# Uncomment to suppress warnings coming from external libraries. +# +#cxx.internal.scope = current + +cxx.std = latest + +using cxx + +hxx{*}: extension = hxx +ixx{*}: extension = ixx +txx{*}: extension = txx +cxx{*}: extension = cxx + +# The test target for cross-testing (running tests under Wine, etc). 
+# +test.target = $cxx.target diff --git a/buildfile b/buildfile new file mode 100644 index 0000000..bc07bdc --- /dev/null +++ b/buildfile @@ -0,0 +1,5 @@ +./: {art/ tests/} doc{README.md} legal{LICENSE} manifest + +# Don't install tests. +# +tests/: install = false diff --git a/manifest b/manifest new file mode 100644 index 0000000..fc10481 --- /dev/null +++ b/manifest @@ -0,0 +1,12 @@ +: 1 +name: libart-unicode +version: 0.1.0-a.0.z +language: c++ +summary: libart-unicode C++ library +license: BSD-4-Clause +description-file: README.md +url: https://art.helloryan.se/ +email: art@helloryan.se +depends: * build2 >= 0.17.0 +depends: * bpkg >= 0.17.0 +depends: libart-validation ^0.1.0- diff --git a/repositories.manifest b/repositories.manifest new file mode 100644 index 0000000..b4a00e9 --- /dev/null +++ b/repositories.manifest @@ -0,0 +1,6 @@ +: 1 +summary: libart-unicode project repository + +: +role: prerequisite +location: https://code.helloryan.se/art/libart-validation.git##HEAD diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000..662178d --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,8 @@ +# Test executables. +# +driver + +# Testscript output directories (can be symlinks). +# +test +test-* diff --git a/tests/build/.gitignore b/tests/build/.gitignore new file mode 100644 index 0000000..974e01d --- /dev/null +++ b/tests/build/.gitignore @@ -0,0 +1,4 @@ +/config.build +/root/ +/bootstrap/ +build/ diff --git a/tests/build/bootstrap.build b/tests/build/bootstrap.build new file mode 100644 index 0000000..a07b5ea --- /dev/null +++ b/tests/build/bootstrap.build @@ -0,0 +1,5 @@ +project = # Unnamed tests subproject. 
+ +using config +using test +using dist diff --git a/tests/build/root.build b/tests/build/root.build new file mode 100644 index 0000000..a67b2fe --- /dev/null +++ b/tests/build/root.build @@ -0,0 +1,16 @@ +cxx.std = latest + +using cxx + +hxx{*}: extension = hxx +ixx{*}: extension = ixx +txx{*}: extension = txx +cxx{*}: extension = cxx + +# Every exe{} in this subproject is by default a test. +# +exe{*}: test = true + +# The test target for cross-testing (running tests under Wine, etc). +# +test.target = $cxx.target diff --git a/tests/buildfile b/tests/buildfile new file mode 100644 index 0000000..aeeab15 --- /dev/null +++ b/tests/buildfile @@ -0,0 +1 @@ +./: {*/ -build/}