Boost C++ Libraries

...one of the most highly regarded and expertly designed C++ library projects in the world. Herb Sutter and Andrei Alexandrescu, C++ Coding Standards

This is the documentation for an old version of Boost. Click here to view this page for the latest version.
Boost.Nowide
utf8_codecvt.hpp
//
// Copyright (c) 2015 Artyom Beilis (Tonkikh)
// Copyright (c) 2020 Alexander Grund
//
// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
9 #ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
10 #define BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
11 
#include <boost/nowide/replacement.hpp>
#include <boost/nowide/utf/utf.hpp>
#include <cstdint>
#include <locale>
16 
17 namespace boost {
18 namespace nowide {
19 
// The conversion state is stored in the first two bytes of std::mbstate_t
// (see detail::read_state / detail::write_state), so it must be at least that large.
static_assert(sizeof(std::mbstate_t) >= 2, "mbstate_t is too small to store a UTF-16 code unit");
namespace detail {
    /// Copy exactly two bytes from @p src to @p dst, byte by byte.
    ///
    /// Hand-rolled on purpose: avoids including <cstring> just for std::memcpy.
    /// Both pointers must refer to at least two accessible bytes.
    inline void copy_uint16_t(void* dst, const void* src)
    {
        unsigned char* cdst = static_cast<unsigned char*>(dst);
        const unsigned char* csrc = static_cast<const unsigned char*>(src);
        cdst[0] = csrc[0];
        cdst[1] = csrc[1];
    }

    /// Return the 16-bit value stored in the first two bytes of @p src.
    inline std::uint16_t read_state(const std::mbstate_t& src)
    {
        std::uint16_t dst;
        copy_uint16_t(&dst, &src);
        return dst;
    }

    /// Store @p src into the first two bytes of @p dst; the remaining bytes are left untouched.
    inline void write_state(std::mbstate_t& dst, const std::uint16_t src)
    {
        copy_uint16_t(&dst, &src);
    }
} // namespace detail
41 
#if defined _MSC_VER && _MSC_VER < 1700
// MSVC do_length is non-standard: it counts wide characters instead of narrow ones
// and does not change mbstate (the state parameter is a const reference).
#define BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
#endif
46 
/// Primary template of the UTF-8 codecvt facet; only declared here.
///
/// @tparam CharType Internal (wide) character type.
/// @tparam CharSize Size of CharType in bytes; defaults to sizeof(CharType) and
///                  selects one of the partial specializations (2 or 4).
template<typename CharType, int CharSize = sizeof(CharType)>
class utf8_codecvt;
55 
57  template<typename CharType>
58  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 2> : public std::codecvt<CharType, char, std::mbstate_t>
59  {
60  public:
61  static_assert(sizeof(CharType) >= 2, "CharType must be able to store UTF16 code point");
62 
63  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
64  {}
65 
66  protected:
67  typedef CharType uchar;
68 
69  std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
70  {
71  if(detail::read_state(s) != 0)
72  return std::codecvt_base::error;
73  next = from;
74  return std::codecvt_base::ok;
75  }
76  int do_encoding() const noexcept override
77  {
78  return 0;
79  }
80  int do_max_length() const noexcept override
81  {
82  return 4;
83  }
84  bool do_always_noconv() const noexcept override
85  {
86  return false;
87  }
88 
89  int do_length(std::mbstate_t
90 #ifdef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
91  const
92 #endif
93  & std_state,
94  const char* from,
95  const char* from_end,
96  size_t max) const override
97  {
98  std::uint16_t state = detail::read_state(std_state);
99 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
100  const char* save_from = from;
101 #else
102  size_t save_max = max;
103 #endif
104  while(max > 0 && from < from_end)
105  {
106  const char* prev_from = from;
107  std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
108  if(ch == utf::illegal)
109  {
111  } else if(ch == utf::incomplete)
112  {
113  from = prev_from;
114  break;
115  }
116  max--;
117  if(ch > 0xFFFF)
118  {
119  if(state == 0)
120  {
121  from = prev_from;
122  state = 1;
123  } else
124  {
125  state = 0;
126  }
127  }
128  }
129 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
130  detail::write_state(std_state, state);
131  return static_cast<int>(from - save_from);
132 #else
133  return static_cast<int>(save_max - max);
134 #endif
135  }
136 
137  std::codecvt_base::result do_in(std::mbstate_t& std_state,
138  const char* from,
139  const char* from_end,
140  const char*& from_next,
141  uchar* to,
142  uchar* to_end,
143  uchar*& to_next) const override
144  {
145  std::codecvt_base::result r = std::codecvt_base::ok;
146 
147  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
148  // according to standard. We use it to keep a flag 0/1 for surrogate pair writing
149  //
150  // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observed
151  // and first pair is written, but no input consumed
152  std::uint16_t state = detail::read_state(std_state);
153  while(to < to_end && from < from_end)
154  {
155  const char* from_saved = from;
156 
157  uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
158 
159  if(ch == utf::illegal)
160  {
162  } else if(ch == utf::incomplete)
163  {
164  from = from_saved;
165  r = std::codecvt_base::partial;
166  break;
167  }
168  // Normal codepoints go directly to stream
169  if(ch <= 0xFFFF)
170  {
171  *to++ = static_cast<CharType>(ch);
172  } else
173  {
174  // for other codepoints we do following
175  //
176  // 1. We can't consume our input as we may find ourself
177  // in state where all input consumed but not all output written,i.e. only
178  // 1st pair is written
179  // 2. We only write first pair and mark this in the state, we also revert back
180  // the from pointer in order to make sure this codepoint would be read
181  // once again and then we would consume our input together with writing
182  // second surrogate pair
183  ch -= 0x10000;
184  std::uint16_t vh = static_cast<std::uint16_t>(ch >> 10);
185  std::uint16_t vl = ch & 0x3FF;
186  std::uint16_t w1 = vh + 0xD800;
187  std::uint16_t w2 = vl + 0xDC00;
188  if(state == 0)
189  {
190  from = from_saved;
191  *to++ = static_cast<CharType>(w1);
192  state = 1;
193  } else
194  {
195  *to++ = static_cast<CharType>(w2);
196  state = 0;
197  }
198  }
199  }
200  from_next = from;
201  to_next = to;
202  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
203  r = std::codecvt_base::partial;
204  detail::write_state(std_state, state);
205  return r;
206  }
207 
208  std::codecvt_base::result do_out(std::mbstate_t& std_state,
209  const uchar* from,
210  const uchar* from_end,
211  const uchar*& from_next,
212  char* to,
213  char* to_end,
214  char*& to_next) const override
215  {
216  std::codecvt_base::result r = std::codecvt_base::ok;
217  // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
218  // according to standard. We assume that sizeof(mbstate_t) >=2 in order
219  // to be able to store first observed surrogate pair
220  //
221  // State: state!=0 - a first surrogate pair was observed (state = first pair),
222  // we expect the second one to come and then zero the state
224  std::uint16_t state = detail::read_state(std_state);
225  while(to < to_end && from < from_end)
226  {
227  std::uint32_t ch = 0;
228  if(state != 0)
229  {
230  // if the state indicates that 1st surrogate pair was written
231  // we should make sure that the second one that comes is actually
232  // second surrogate
233  std::uint16_t w1 = state;
234  std::uint16_t w2 = *from;
235  // we don't forward from as writing may fail to incomplete or
236  // partial conversion
237  if(0xDC00 <= w2 && w2 <= 0xDFFF)
238  {
239  std::uint16_t vh = w1 - 0xD800;
240  std::uint16_t vl = w2 - 0xDC00;
241  ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
242  } else
243  {
245  }
246  } else
247  {
248  ch = *from;
249  if(0xD800 <= ch && ch <= 0xDBFF)
250  {
251  // if this is a first surrogate pair we put
252  // it into the state and consume it, note we don't
253  // go forward as it should be illegal so we increase
254  // the from pointer manually
255  state = static_cast<std::uint16_t>(ch);
256  from++;
257  continue;
258  } else if(0xDC00 <= ch && ch <= 0xDFFF)
259  {
260  // if we observe second surrogate pair and
261  // first only may be expected we should break from the loop with error
262  // as it is illegal input
264  }
265  }
266  if(!utf::is_valid_codepoint(ch))
267  {
268  r = std::codecvt_base::error;
269  break;
270  }
271  int len = utf::utf_traits<char>::width(ch);
272  if(to_end - to < len)
273  {
274  r = std::codecvt_base::partial;
275  break;
276  }
277  to = utf::utf_traits<char>::encode(ch, to);
278  state = 0;
279  from++;
280  }
281  from_next = from;
282  to_next = to;
283  if(r == std::codecvt_base::ok && (from != from_end || state != 0))
284  r = std::codecvt_base::partial;
285  detail::write_state(std_state, state);
286  return r;
287  }
288  };
289 
291  template<typename CharType>
292  class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 4> : public std::codecvt<CharType, char, std::mbstate_t>
293  {
294  public:
295  utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
296  {}
297 
298  protected:
299  typedef CharType uchar;
300 
301  std::codecvt_base::result
302  do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
303  {
304  next = from;
305  return std::codecvt_base::ok;
306  }
307  int do_encoding() const noexcept override
308  {
309  return 0;
310  }
311  int do_max_length() const noexcept override
312  {
313  return 4;
314  }
315  bool do_always_noconv() const noexcept override
316  {
317  return false;
318  }
319 
320  int do_length(std::mbstate_t
321 #ifdef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
322  const
323 #endif
324  & /*state*/,
325  const char* from,
326  const char* from_end,
327  size_t max) const override
328  {
329 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
330  const char* start_from = from;
331 #else
332  size_t save_max = max;
333 #endif
334 
335  while(max > 0 && from < from_end)
336  {
337  const char* save_from = from;
338  std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
339  if(ch == utf::incomplete)
340  {
341  from = save_from;
342  break;
343  } else if(ch == utf::illegal)
344  {
346  }
347  max--;
348  }
349 #ifndef BOOST_NOWIDE_DO_LENGTH_MBSTATE_CONST
350  return from - start_from;
351 #else
352  return save_max - max;
353 #endif
354  }
355 
356  std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
357  const char* from,
358  const char* from_end,
359  const char*& from_next,
360  uchar* to,
361  uchar* to_end,
362  uchar*& to_next) const override
363  {
364  std::codecvt_base::result r = std::codecvt_base::ok;
365 
366  while(to < to_end && from < from_end)
367  {
368  const char* from_saved = from;
369 
370  uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
371 
372  if(ch == utf::illegal)
373  {
375  } else if(ch == utf::incomplete)
376  {
377  r = std::codecvt_base::partial;
378  from = from_saved;
379  break;
380  }
381  *to++ = ch;
382  }
383  from_next = from;
384  to_next = to;
385  if(r == std::codecvt_base::ok && from != from_end)
386  r = std::codecvt_base::partial;
387  return r;
388  }
389 
390  std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
391  const uchar* from,
392  const uchar* from_end,
393  const uchar*& from_next,
394  char* to,
395  char* to_end,
396  char*& to_next) const override
397  {
398  std::codecvt_base::result r = std::codecvt_base::ok;
399  while(to < to_end && from < from_end)
400  {
401  std::uint32_t ch = 0;
402  ch = *from;
403  if(!utf::is_valid_codepoint(ch))
404  {
406  }
407  int len = utf::utf_traits<char>::width(ch);
408  if(to_end - to < len)
409  {
410  r = std::codecvt_base::partial;
411  break;
412  }
413  to = utf::utf_traits<char>::encode(ch, to);
414  from++;
415  }
416  from_next = from;
417  to_next = to;
418  if(r == std::codecvt_base::ok && from != from_end)
419  r = std::codecvt_base::partial;
420  return r;
421  }
422  };
423 
424 } // namespace nowide
425 } // namespace boost
426 
427 #endif
Definition: utf8_codecvt.hpp:54
#define BOOST_NOWIDE_REPLACEMENT_CHARACTER
Definition: replacement.hpp:16