BashSpark
Loading...
Searching...
No Matches
utf.h
Go to the documentation of this file.
1
32#pragma once
33
34#include <iomanip>
35#include <regex>
36#include <cstdint>
37#include <iostream>
38
40
41namespace bs {
/**
 * @brief Combines a UTF-16 surrogate pair into a single UTF-32 code point.
 *
 * The offset of the high surrogate from 0xD800 supplies the upper ten bits,
 * the offset of the low surrogate from 0xDC00 supplies the lower ten bits,
 * and 0x10000 is added to the merged value. No range checking is performed
 * on either argument.
 *
 * @param high High (leading) surrogate.
 * @param low Low (trailing) surrogate.
 * @return The combined code point.
 */
constexpr char32_t combine_surrogates(const char16_t high, const char16_t low) {
    const char32_t cUpperBits = (static_cast<char32_t>(high) - 0xD800) << 10;
    const char32_t cLowerBits = static_cast<char32_t>(low) - 0xDC00;
    return (cUpperBits | cLowerBits) + 0x10000;
}
53
/**
 * @brief Formats a code point as a hexadecimal escape sequence.
 *
 * The escape prefix is selected by nLength: 1 gives `\x`, 2 gives `\u` and
 * 4 gives `\U`; any other length produces no prefix. The value is written
 * in lowercase hexadecimal, zero-padded to two digits per byte
 * (`nLength * 2`), which is the digit count parse_utf reads for the same
 * width. Values that need more digits than the padding width are written
 * in full and are not truncated.
 *
 * @param cChar Code point to format.
 * @param nLength Width of the escape in bytes.
 * @return The escape sequence, e.g. `\u00e9` for (0xE9, 2).
 */
inline std::string to_hex_string(const char32_t cChar, const std::size_t nLength) {
    std::ostringstream oStream;
    switch (nLength) {
        case 1: oStream.write("\\x", 2); break;
        case 2: oStream.write("\\u", 2); break;
        case 4: oStream.write("\\U", 2); break;
        default: break;
    }
    // Two hex digits per byte, so the result is a well-formed escape of that width.
    oStream << std::hex << std::setfill('0')
            << std::setw(static_cast<int>(nLength * 2))
            << static_cast<std::uint32_t>(cChar);
    return oStream.str();
}
71
81 inline void write_char32_t(ofakestream &oStream, const char32_t cChar) {
82 if (cChar <= 0x7F) {
83 // 1 byte (ASCII range)
84 oStream.put(static_cast<char>(cChar));
85 } else if (cChar <= 0x7FF) {
86 // 2 bytes
87 oStream.put(static_cast<char>(cChar >> 6 | 0xC0));
88 oStream.put(static_cast<char>(cChar & 0x3F | 0x80));
89 } else if (cChar <= 0xFFFF) {
90 // 3 bytes
91 oStream.put(static_cast<char>(cChar >> 12 | 0xE0));
92 oStream.put(static_cast<char>(cChar >> 6 & 0x3F | 0x80));
93 oStream.put(static_cast<char>(cChar & 0x3F | 0x80));
94 } else if (cChar <= 0x10FFFF) {
95 // 4 bytes
96 oStream.put(static_cast<char>(cChar >> 18 | 0xF0));
97 oStream.put(static_cast<char>(cChar >> 12 & 0x3F | 0x80));
98 oStream.put(static_cast<char>(cChar >> 6 & 0x3F | 0x80));
99 oStream.put(static_cast<char>(cChar & 0x3F | 0x80));
100 }
101 }
102
109 inline std::string write_char32_t(const char32_t cChar) {
110 ofakestream oStream;
111 write_char32_t(oStream, cChar);
112 return oStream.str();
113 }
114
128 inline bool parse_utf(ifakestream &oIstream, const std::size_t nCount, char32_t &cResult) {
129 // Read
130 std::string sValue(nCount * 2, '\0');
131 oIstream.read(sValue.data(), nCount * 2);
132
133 // Check hex
134 static std::regex oRegex("^[0-9A-Fa-f]+$");
135 if (!std::regex_match(sValue, oRegex))
136 return false;
137 auto cChar = static_cast<char32_t>(std::stoull(sValue, nullptr, 16));
138
139 switch (nCount) {
140 case 1: {
141 if (cChar > 0x7F)
142 return false;
143 break;
144 }
145 case 2: {
146 // Deal with surrogates
147 if (cChar >= 0xD800 && cChar <= 0xDBFF) {
148 if (oIstream.get() != '\\') return false;
149 if (oIstream.get() != 'u') return false;
150
151 // Load other surrogate and check regex
152 oIstream.read(sValue.data(), nCount * 2);
153 if (!std::regex_match(sValue, oRegex))
154 return false;
155
156 const auto cLow = static_cast<char32_t>(std::stoull(sValue, nullptr, 16));
157 if (cLow < 0xDC00 || cLow > 0xDFFF)
158 return false;
159
160 cChar = combine_surrogates(
161 static_cast<char16_t>(cChar),
162 static_cast<char16_t>(cLow)
163 );
164 } else if (cChar >= 0xDC00) {
165 // High surrogates and above
166 return false;
167 }
168 break;
169 }
170
171 case 4: {
172 if ((cChar >= 0xD800 && cChar <= 0xDFFF) || cChar > 0x10FFFF)
173 return false;
174 break;
175 }
176
177 default: break;
178 }
179
180 cResult = cChar;
181 return true;
182 }
183}
A class for input stream behavior with a character type.
Definition fakestream.h:70
ALWAYS_INLINE int_type get() noexcept
Retrieves the next character from the stream and moves to the next.
Definition fakestream.h:119
ALWAYS_INLINE std::size_t read(char_type *const pBuffer, const std::size_t nCount) noexcept
Reads a specified number of characters into a buffer.
Definition fakestream.h:154
A class for output stream behavior with a character type.
Definition fakestream.h:263
ALWAYS_INLINE std::basic_string< char_type > str() const
Converts the current data to a string.
Definition fakestream.h:382
ALWAYS_INLINE void put(const char_type cChar)
Writes a single character to the stream.
Definition fakestream.h:309
Generic input and output stream classes with character types.
BashSpark main namespace.
Definition command.h:39
bool parse_utf(ifakestream &oIstream, const std::size_t nCount, char32_t &cResult)
Parses a UTF-n encoded character from an input stream.
Definition utf.h:128
std::string to_hex_string(const char32_t cChar, const std::size_t nLength)
Converts a UTF-32 character to a hexadecimal string representation.
Definition utf.h:61
constexpr char32_t combine_surrogates(const char16_t high, const char16_t low)
Combines high and low UTF-16 surrogates into a UTF-32 code point.
Definition utf.h:49
void write_char32_t(ofakestream &oStream, const char32_t cChar)
Writes a UTF-32 character to an output stream in UTF-8 encoding.
Definition utf.h:81