KiCad PCB EDA Suite
utf8.cpp
Go to the documentation of this file.
1 /*
2  * This program source code file is part of KiCad, a free EDA CAD application.
3  *
4  * Copyright (C) 2013-2017 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
5  * Copyright (C) 2013-2021 KiCad Developers, see AUTHORS.txt for contributors.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, you may find one here:
19  * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
20  * or you may search the http://www.gnu.org website for the version 2 license,
21  * or you may write to the Free Software Foundation, Inc.,
22  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23  */
24 
25 #include <utf8.h>
26 #include <ki_exception.h>
27 #include <wx/strconv.h>
28 #include <wx/buffer.h>
29 #include <vector>
30 
31 #include <cassert>
32 
33 
34 /*
35  These are not inlined so that code space is saved by encapsulating the
36  creation of intermediate objects and the referencing of wxConvUTF8.
37 */
38 
39 
40 UTF8::UTF8( const wxString& o ) :
41  m_s( (const char*) o.utf8_str() )
42 {
43 }
44 
45 
46 wxString UTF8::wx_str() const
47 {
48  return wxString( c_str(), wxConvUTF8 );
49 }
50 
51 
52 UTF8::operator wxString () const
53 {
54  return wxString( c_str(), wxConvUTF8 );
55 }
56 
57 
58 UTF8& UTF8::operator=( const wxString& o )
59 {
60  m_s = (const char*) o.utf8_str();
61  return *this;
62 }
63 
64 
65 // There is no wxWidgets function that does this, because wchar_t is 16 bits
66 // on windows and wx wants to encode the output in UTF16 for such.
67 
68 int UTF8::uni_forward( const unsigned char* aSequence, unsigned* aResult )
69 {
70  unsigned ch = *aSequence;
71 
72  if( ch < 0x80 )
73  {
74  if( aResult )
75  *aResult = ch;
76  return 1;
77  }
78 
79  const unsigned char* s = aSequence;
80 
81  static const unsigned char utf8_len[] = {
82  // Map encoded prefix byte to sequence length. Zero means
83  // illegal prefix. See RFC 3629 for details
84  /*
85  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F
86  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
87  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
88  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
89  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
90  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
91  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F
93  */
94  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F
95  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
98  0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF
99  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
100  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
101  4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F0-F4 + F5-FF
102  };
103 
104  int len = utf8_len[ *s - 0x80 /* top half of table is missing */ ];
105 
106  switch( len )
107  {
108  default:
109  case 0:
110  if( aResult )
111  wxFAIL_MSG( "uni_forward: invalid start byte" );
112 
113  return 0;
114  break;
115 
116  case 2:
117  if( ( s[1] & 0xc0 ) != 0x80 )
118  {
119  if( aResult )
120  wxFAIL_MSG( "uni_forward: invalid continuation byte" );
121 
122  return 0;
123  }
124 
125  ch = ((s[0] & 0x1f) << 6) +
126  ((s[1] & 0x3f) << 0);
127 
128  // assert( ch > 0x007F && ch <= 0x07FF );
129  break;
130 
131  case 3:
132  if( (s[1] & 0xc0) != 0x80 ||
133  (s[2] & 0xc0) != 0x80 ||
134  (s[0] == 0xE0 && s[1] < 0xA0)
135  // || (s[0] == 0xED && s[1] > 0x9F)
136  )
137  {
138  if( aResult )
139  wxFAIL_MSG( "uni_forward: invalid continuation byte" );
140 
141  return 0;
142  }
143 
144  ch = ((s[0] & 0x0f) << 12) +
145  ((s[1] & 0x3f) << 6 ) +
146  ((s[2] & 0x3f) << 0 );
147 
148  // assert( ch > 0x07FF && ch <= 0xFFFF );
149  break;
150 
151  case 4:
152  if( (s[1] & 0xc0) != 0x80 ||
153  (s[2] & 0xc0) != 0x80 ||
154  (s[3] & 0xc0) != 0x80 ||
155  (s[0] == 0xF0 && s[1] < 0x90) ||
156  (s[0] == 0xF4 && s[1] > 0x8F) )
157  {
158  if( aResult )
159  wxFAIL_MSG( "uni_forward: invalid continuation byte" );
160 
161  return 0;
162  }
163 
164  ch = ((s[0] & 0x7) << 18) +
165  ((s[1] & 0x3f) << 12) +
166  ((s[2] & 0x3f) << 6 ) +
167  ((s[3] & 0x3f) << 0 );
168 
169  // assert( ch > 0xFFFF && ch <= 0x10ffff );
170  break;
171  }
172 
173  if( aResult )
174  *aResult = ch;
175 
176  return len;
177 }
178 
179 
180 bool IsUTF8( const char* aString )
181 {
182  int len = strlen( aString );
183 
184  if( len )
185  {
186  const unsigned char* next = (unsigned char*) aString;
187  const unsigned char* end = next + len;
188 
189  while( next < end )
190  {
191  int charLen = UTF8::uni_forward( next, nullptr );
192 
193  if( charLen == 0 )
194  return false;
195 
196  next += charLen;
197  }
198 
199  // uni_forward() should find the exact end if it is truly UTF8
200  if( next > end )
201  return false;
202  }
203 
204  return true;
205 }
206 
207 
208 UTF8::UTF8( const wchar_t* txt )
209 {
210  try
211  {
212  std::vector< char > temp( wcslen( txt ) * 4 + 1 );
213  wxConvUTF8.WC2MB( temp.data(), txt, temp.size() );
214  m_s.assign( temp.data() );
215  }
216  catch(...)
217  {
218  auto string = wxSafeConvertWX2MB( txt );
219  m_s.assign( string );
220  }
221 
222  m_s.shrink_to_fit();
223 }
224 
225 
226 UTF8& UTF8::operator+=( unsigned w_ch )
227 {
228  if( w_ch <= 0x7F )
229  {
230  m_s.operator+=( char( w_ch ) );
231  }
232  else
233  {
234  //TODO: Remove wchar use. Replace with std::byte*
235  wchar_t wide_chr[2]; // buffer to store wide chars (UTF16) read from aText
236  wide_chr[1] = 0;
237  wide_chr[0] = w_ch;
238  UTF8 substr( wide_chr );
239  m_s += substr.m_s;
240  }
241 
242  return *this;
243 }
CITER next(CITER it)
Definition: ptree.cpp:126
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and fro...
Definition: utf8.h:70
UTF8 & operator=(const wxString &o)
Definition: utf8.cpp:58
UTF8()
Definition: utf8.h:93
const char * c_str() const
Definition: utf8.h:102
std::string m_s
Definition: utf8.h:304
bool IsUTF8(const char *aString)
Test a C string to see if it is UTF8 encoded.
Definition: utf8.cpp:180
UTF8 & operator+=(const UTF8 &str)
Definition: utf8.h:123
wxString wx_str() const
Definition: utf8.cpp:46
std::string substr(size_t pos=0, size_t len=npos) const
Definition: utf8.h:178
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=nullptr)
Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes,...
Definition: utf8.cpp:68