Boost C++ Libraries

"...one of the most highly regarded and expertly designed C++ library projects in the world." — Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

This is the documentation for an old version of Boost. Click here to view this page for the latest version.
Boost.Nowide
utf8_codecvt.hpp
1 //
2 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
3 // Copyright (c) 2020 Alexander Grund
4 //
5 // Distributed under the Boost Software License, Version 1.0. (See
6 // accompanying file LICENSE or copy at
7 // http://www.boost.org/LICENSE_1_0.txt)
8 //
9 #ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
10 #define BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
11 
#include <boost/nowide/replacement.hpp>
#include <boost/nowide/utf/utf.hpp>
#include <cassert>
#include <cstdint>
#include <locale>
17 
18 namespace boost {
19 namespace nowide {
20 
// The conversion state is used to hold one pending 16-bit UTF-16 code unit,
// so std::mbstate_t must provide at least two bytes of storage.
static_assert(sizeof(std::mbstate_t) >= 2, "mbstate_t is too small to store a UTF-16 codepoint");
namespace detail {
    /// Copy the first two bytes of the object at `src` into the object at `dst`.
    ///
    /// Hand-written byte copy so that <cstring> does not have to be included
    /// just for std::memcpy. Both pointers must refer to at least two bytes.
    inline void copy_uint16_t(void* dst, const void* src) noexcept
    {
        unsigned char* cdst = static_cast<unsigned char*>(dst);
        const unsigned char* csrc = static_cast<const unsigned char*>(src);
        cdst[0] = csrc[0];
        cdst[1] = csrc[1];
    }

    /// Return the 16-bit value stored in the first two bytes of `src`.
    inline std::uint16_t read_state(const std::mbstate_t& src) noexcept
    {
        std::uint16_t dst;
        copy_uint16_t(&dst, &src);
        return dst;
    }

    /// Store `src` into the first two bytes of `dst`; the other bytes of `dst` are left untouched.
    inline void write_state(std::mbstate_t& dst, const std::uint16_t src) noexcept
    {
        copy_uint16_t(&dst, &src);
    }
} // namespace detail
42 
/// std::codecvt facet converting between UTF-8 `char` sequences and `CharType` sequences.
///
/// Only declared here; the implementation is selected by `CharSize`, which
/// defaults to `sizeof(CharType)`. This header defines specializations for
/// `CharSize == 2` and `CharSize == 4`.
template<typename CharType, int CharSize = sizeof(CharType)>
class utf8_codecvt;
51 
52  BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN
54  template<typename CharType>
55  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 2> : public std::codecvt<CharType, char, std::mbstate_t>
56  {
57  public:
58  static_assert(sizeof(CharType) >= 2, "CharType must be able to store UTF16 code point");
59 
60  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
61  {}
62  BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END
63 
64  protected:
65  using uchar = CharType;
66 
67  std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
68  {
69  if(detail::read_state(s) != 0)
70  return std::codecvt_base::error;
71  next = from;
72  return std::codecvt_base::ok;
73  }
74  int do_encoding() const noexcept override
75  {
76  return 0;
77  }
78  int do_max_length() const noexcept override
79  {
80  return 4;
81  }
82  bool do_always_noconv() const noexcept override
83  {
84  return false;
85  }
86 
87  // LCOV_EXCL_START
88  int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
89  {
90  // LCOV_EXCL_STOP
91  using utf16_traits = utf::utf_traits<uchar, 2>;
92  std::uint16_t state = detail::read_state(std_state);
93  const char* save_from = from;
94  if(state && max > 0)
95  {
96  max--;
97  state = 0;
98  }
99  while(max > 0 && from < from_end)
100  {
101  const char* prev_from = from;
102  std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
103  if(ch == utf::illegal)
104  {
106  } else if(ch == utf::incomplete)
107  {
108  from = prev_from;
109  break;
110  }
111  // If we can't write the char, we have to save the low surrogate in state
112  if(BOOST_LIKELY(static_cast<size_t>(utf16_traits::width(ch)) <= max))
113  {
114  max -= utf16_traits::width(ch);
115  } else
116  {
117  static_assert(utf16_traits::max_width == 2, "Required for below");
118  std::uint16_t tmpOut[2]{};
119  utf16_traits::encode(ch, tmpOut);
120  state = tmpOut[1];
121  break;
122  }
123  }
124  detail::write_state(std_state, state);
125  return static_cast<int>(from - save_from);
126  }
127 
128  std::codecvt_base::result do_in(std::mbstate_t& std_state, // LCOV_EXCL_LINE
129  const char* from,
130  const char* from_end,
131  const char*& from_next,
132  uchar* to,
133  uchar* to_end,
134  uchar*& to_next) const override
135  {
136  std::codecvt_base::result r = std::codecvt_base::ok;
137  using utf16_traits = utf::utf_traits<uchar, 2>;
138 
139  // mbstate_t is POD type and should be initialized to 0 (i.e. state = stateT())
140  // according to standard.
141  // We use it to store a low surrogate if it was not yet written, else state is 0
142  std::uint16_t state = detail::read_state(std_state);
143  // Write low surrogate if present
144  if(state && to < to_end)
145  {
146  *to++ = static_cast<CharType>(state);
147  state = 0;
148  }
149  while(to < to_end && from < from_end)
150  {
151  const char* from_saved = from;
152 
153  uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
154 
155  if(ch == utf::illegal)
156  {
158  } else if(ch == utf::incomplete)
159  {
160  from = from_saved;
161  r = std::codecvt_base::partial;
162  break;
163  }
164  // If the encoded char fits, write directly, else safe the low surrogate in state
165  if(BOOST_LIKELY(utf16_traits::width(ch) <= to_end - to))
166  {
167  to = utf16_traits::encode(ch, to);
168  } else
169  {
170  static_assert(utf16_traits::max_width == 2, "Required for below");
171  std::uint16_t tmpOut[2]{};
172  utf16_traits::encode(ch, tmpOut);
173  *to++ = static_cast<CharType>(tmpOut[0]);
174  state = tmpOut[1];
175  break;
176  }
177  }
178  from_next = from;
179  to_next = to;
180  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
181  r = std::codecvt_base::partial;
182  detail::write_state(std_state, state);
183  return r;
184  }
185 
186  std::codecvt_base::result do_out(std::mbstate_t& std_state,
187  const uchar* from,
188  const uchar* from_end,
189  const uchar*& from_next,
190  char* to,
191  char* to_end,
192  char*& to_next) const override
193  {
194  std::codecvt_base::result r = std::codecvt_base::ok;
195  using utf16_traits = utf::utf_traits<uchar, 2>;
196  // mbstate_t is POD type and should be initialized to 0
197  // (i.e. state = stateT()) according to standard.
198  // We use it to store the first observed surrogate pair, or 0 if there is none yet
199  std::uint16_t state = detail::read_state(std_state);
200  for(; to < to_end && from < from_end; ++from)
201  {
202  std::uint32_t ch = 0;
203  if(state != 0)
204  {
205  // We have a high surrogate, so now there should be a low surrogate
206  std::uint16_t w1 = state;
207  std::uint16_t w2 = *from;
208  if(BOOST_LIKELY(utf16_traits::is_trail(w2)))
209  {
210  ch = utf16_traits::combine_surrogate(w1, w2);
211  } else
212  {
214  }
215  } else
216  {
217  std::uint16_t w1 = *from;
218  if(BOOST_LIKELY(utf16_traits::is_single_codepoint(w1)))
219  {
220  ch = w1;
221  } else if(BOOST_LIKELY(utf16_traits::is_first_surrogate(w1)))
222  {
223  // Store into state and continue at next character
224  state = w1;
225  continue;
226  } else
227  {
228  // Neither a single codepoint nor a high surrogate so must be low surrogate.
229  // This is an error -> Replace character
231  }
232  }
233  assert(utf::is_valid_codepoint(ch)); // Any valid UTF16 sequence is a valid codepoint
234  int len = utf::utf_traits<char>::width(ch);
235  if(to_end - to < len)
236  {
237  r = std::codecvt_base::partial;
238  break;
239  }
240  to = utf::utf_traits<char>::encode(ch, to);
241  state = 0;
242  }
243  from_next = from;
244  to_next = to;
245  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
246  r = std::codecvt_base::partial;
247  detail::write_state(std_state, state);
248  return r;
249  }
250  };
251 
252  BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN
254  template<typename CharType>
255  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 4> : public std::codecvt<CharType, char, std::mbstate_t>
256  {
257  public:
258  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
259  {}
260  BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END
261 
262  protected:
263  using uchar = CharType;
264 
265  std::codecvt_base::result
266  do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
267  {
268  next = from;
269  return std::codecvt_base::noconv;
270  }
271  int do_encoding() const noexcept override
272  {
273  return 0;
274  }
275  int do_max_length() const noexcept override
276  {
277  return 4;
278  }
279  bool do_always_noconv() const noexcept override
280  {
281  return false;
282  }
283 
284  int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
285  {
286  const char* start_from = from;
287 
288  while(max > 0 && from < from_end)
289  {
290  const char* save_from = from;
291  std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
292  if(ch == utf::incomplete)
293  {
294  from = save_from;
295  break;
296  } else if(ch == utf::illegal)
297  {
299  }
300  max--;
301  }
302  return static_cast<int>(from - start_from);
303  }
304 
305  std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
306  const char* from,
307  const char* from_end,
308  const char*& from_next,
309  uchar* to,
310  uchar* to_end,
311  uchar*& to_next) const override
312  {
313  std::codecvt_base::result r = std::codecvt_base::ok;
314 
315  while(to < to_end && from < from_end)
316  {
317  const char* from_saved = from;
318 
319  uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
320 
321  if(ch == utf::illegal)
322  {
324  } else if(ch == utf::incomplete)
325  {
326  r = std::codecvt_base::partial;
327  from = from_saved;
328  break;
329  }
330  *to++ = ch;
331  }
332  from_next = from;
333  to_next = to;
334  if(r == std::codecvt_base::ok && from != from_end)
335  r = std::codecvt_base::partial;
336  return r;
337  }
338 
339  std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
340  const uchar* from,
341  const uchar* from_end,
342  const uchar*& from_next,
343  char* to,
344  char* to_end,
345  char*& to_next) const override
346  {
347  std::codecvt_base::result r = std::codecvt_base::ok;
348  while(to < to_end && from < from_end)
349  {
350  std::uint32_t ch = 0;
351  ch = *from;
352  if(!utf::is_valid_codepoint(ch))
353  {
355  }
356  int len = utf::utf_traits<char>::width(ch);
357  if(to_end - to < len)
358  {
359  r = std::codecvt_base::partial;
360  break;
361  }
362  to = utf::utf_traits<char>::encode(ch, to);
363  from++;
364  }
365  from_next = from;
366  to_next = to;
367  if(r == std::codecvt_base::ok && from != from_end)
368  r = std::codecvt_base::partial;
369  return r;
370  }
371  };
372 
373 } // namespace nowide
374 } // namespace boost
375 
376 #endif
UTF Traits class - functions to convert UTF sequences to and from Unicode code points.
Definition: utf.hpp:57
static Iterator encode(code_point value, Iterator out)
Definition: utf8_codecvt.hpp:50
static const code_point illegal
Special constant that defines illegal code point.
Definition: utf.hpp:33
#define BOOST_NOWIDE_REPLACEMENT_CHARACTER
Definition: replacement.hpp:16
static const code_point incomplete
Special constant that defines incomplete code point.
Definition: utf.hpp:38
static int width(code_point value)
bool is_valid_codepoint(code_point v)
Checks whether v is a valid code point.
Definition: utf.hpp:43
static code_point decode(Iterator &p, Iterator e)