KiCad PCB EDA Suite
Loading...
Searching...
No Matches
utf8.cpp
Go to the documentation of this file.
1/*
2 * This program source code file is part of KiCad, a free EDA CAD application.
3 *
4 * Copyright (C) 2013-2017 SoftPLC Corporation, Dick Hollenbeck <[email protected]>
5 * Copyright (C) 2013-2021 KiCad Developers, see AUTHORS.txt for contributors.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, you may find one here:
19 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
20 * or you may search the http://www.gnu.org website for the version 2 license,
21 * or you may write to the Free Software Foundation, Inc.,
22 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 */
24
25#include <core/utf8.h>
26#include <wx/strconv.h>
27#include <wx/buffer.h>
28#include <vector>
29
30#include <cassert>
31
32
33/*
34 These are not inlined so that code space is saved by encapsulating the
35 creation of intermediate objects and the referencing of wxConvUTF8.
36*/
37
38
39UTF8::UTF8( const wxString& o ) :
40 m_s( (const char*) o.utf8_str() )
41{
42}
43
44
45wxString UTF8::wx_str() const
46{
47 return wxString( c_str(), wxConvUTF8 );
48}
49
50
51UTF8::operator wxString () const
52{
53 return wxString( c_str(), wxConvUTF8 );
54}
55
56
57UTF8& UTF8::operator=( const wxString& o )
58{
59 m_s = (const char*) o.utf8_str();
60 return *this;
61}
62
63
64// There is no wxWidgets function that decodes a UTF-8 sequence into a full
65// Unicode code point, because wchar_t is 16 bits on Windows and wx encodes
65// its output as UTF-16 there.
66
67int UTF8::uni_forward( const unsigned char* aSequence, unsigned* aResult )
68{
69 unsigned ch = *aSequence;
70
71 if( ch < 0x80 )
72 {
73 if( aResult )
74 *aResult = ch;
75 return 1;
76 }
77
78 const unsigned char* s = aSequence;
79
80 static const unsigned char utf8_len[] = {
81 // Map encoded prefix byte to sequence length. Zero means
82 // illegal prefix. See RFC 3629 for details
83 /*
84 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F
85 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
86 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
87 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
88 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
89 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
90 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
91 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F
92 */
93 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F
94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
97 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF
98 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
99 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
100 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F0-F4 + F5-FF
101 };
102
103 int len = utf8_len[ *s - 0x80 /* top half of table is missing */ ];
104
105 switch( len )
106 {
107 default:
108 case 0:
109 if( aResult )
110 wxFAIL_MSG( wxS( "uni_forward: invalid start byte" ) );
111
112 return 0;
113 break;
114
115 case 2:
116 if( ( s[1] & 0xc0 ) != 0x80 )
117 {
118 if( aResult )
119 wxFAIL_MSG( wxS( "uni_forward: invalid continuation byte" ) );
120
121 return 0;
122 }
123
124 ch = ((s[0] & 0x1f) << 6) +
125 ((s[1] & 0x3f) << 0);
126
127 // assert( ch > 0x007F && ch <= 0x07FF );
128 break;
129
130 case 3:
131 if( (s[1] & 0xc0) != 0x80 ||
132 (s[2] & 0xc0) != 0x80 ||
133 (s[0] == 0xE0 && s[1] < 0xA0)
134 // || (s[0] == 0xED && s[1] > 0x9F)
135 )
136 {
137 if( aResult )
138 wxFAIL_MSG( wxS( "uni_forward: invalid continuation byte" ) );
139
140 return 0;
141 }
142
143 ch = ((s[0] & 0x0f) << 12) +
144 ((s[1] & 0x3f) << 6 ) +
145 ((s[2] & 0x3f) << 0 );
146
147 // assert( ch > 0x07FF && ch <= 0xFFFF );
148 break;
149
150 case 4:
151 if( (s[1] & 0xc0) != 0x80 ||
152 (s[2] & 0xc0) != 0x80 ||
153 (s[3] & 0xc0) != 0x80 ||
154 (s[0] == 0xF0 && s[1] < 0x90) ||
155 (s[0] == 0xF4 && s[1] > 0x8F) )
156 {
157 if( aResult )
158 wxFAIL_MSG( wxS( "uni_forward: invalid continuation byte" ) );
159
160 return 0;
161 }
162
163 ch = ((s[0] & 0x7) << 18) +
164 ((s[1] & 0x3f) << 12) +
165 ((s[2] & 0x3f) << 6 ) +
166 ((s[3] & 0x3f) << 0 );
167
168 // assert( ch > 0xFFFF && ch <= 0x10ffff );
169 break;
170 }
171
172 if( aResult )
173 *aResult = ch;
174
175 return len;
176}
177
178
179bool IsUTF8( const char* aString )
180{
181 int len = strlen( aString );
182
183 if( len )
184 {
185 const unsigned char* next = (unsigned char*) aString;
186 const unsigned char* end = next + len;
187
188 while( next < end )
189 {
190 int charLen = UTF8::uni_forward( next, nullptr );
191
192 if( charLen == 0 )
193 return false;
194
195 next += charLen;
196 }
197
198 // uni_forward() should find the exact end if it is truly UTF8
199 if( next > end )
200 return false;
201 }
202
203 return true;
204}
205
206
207UTF8::UTF8( const wchar_t* txt )
208{
209 try
210 {
211 std::vector< char > temp( wcslen( txt ) * 4 + 1 );
212 wxConvUTF8.WC2MB( temp.data(), txt, temp.size() );
213 m_s.assign( temp.data() );
214 }
215 catch(...)
216 {
217 auto string = wxSafeConvertWX2MB( txt );
218 m_s.assign( string );
219 }
220
221 m_s.shrink_to_fit();
222}
223
224
225UTF8& UTF8::operator+=( unsigned w_ch )
226{
227 if( w_ch <= 0x7F )
228 {
229 m_s.operator+=( char( w_ch ) );
230 }
231 else
232 {
233 wchar_t wide_chr[2]; // buffer to store wide chars (UTF16) read from aText
234 wide_chr[1] = 0;
235 wide_chr[0] = w_ch;
236 UTF8 substr( wide_chr );
237 m_s += substr.m_s;
238 }
239
240 return *this;
241}
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and fro...
Definition: utf8.h:71
UTF8 & operator+=(const UTF8 &str)
Definition: utf8.h:123
std::string m_s
Definition: utf8.h:304
std::string substr(size_t pos=0, size_t len=npos) const
Definition: utf8.h:178
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=nullptr)
Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes,...
Definition: utf8.cpp:67
const char * c_str() const
Definition: utf8.h:102
wxString wx_str() const
Definition: utf8.cpp:45
UTF8 & operator=(const wxString &o)
Definition: utf8.cpp:57
UTF8()
Definition: utf8.h:93
CITER next(CITER it)
Definition: ptree.cpp:126
bool IsUTF8(const char *aString)
Test a C string to see if it is UTF8 encoded.
Definition: utf8.cpp:179