KiCad PCB EDA Suite
utf8.cpp
Go to the documentation of this file.
1/*
2 * This program source code file is part of KiCad, a free EDA CAD application.
3 *
4 * Copyright (C) 2013-2017 SoftPLC Corporation, Dick Hollenbeck <[email protected]>
5 * Copyright (C) 2013-2021 KiCad Developers, see AUTHORS.txt for contributors.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, you may find one here:
19 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
20 * or you may search the http://www.gnu.org website for the version 2 license,
21 * or you may write to the Free Software Foundation, Inc.,
22 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23 */
24
25#include <utf8.h>
26#include <ki_exception.h>
27#include <wx/strconv.h>
28#include <wx/buffer.h>
29#include <vector>
30
31#include <cassert>
32
33
34/*
35 These are not inlined so that code space is saved by encapsulating the
36 creation of intermediate objects and the referencing of wxConvUTF8.
37*/
38
39
40UTF8::UTF8( const wxString& o ) :
41 m_s( (const char*) o.utf8_str() )
42{
43}
44
45
46wxString UTF8::wx_str() const
47{
48 return wxString( c_str(), wxConvUTF8 );
49}
50
51
52UTF8::operator wxString () const
53{
54 return wxString( c_str(), wxConvUTF8 );
55}
56
57
58UTF8& UTF8::operator=( const wxString& o )
59{
60 m_s = (const char*) o.utf8_str();
61 return *this;
62}
63
64
// There is no wxWidgets function that does this, because wchar_t is 16 bits
// on Windows and wx wants to encode the output in UTF-16 on such platforms.
67
68int UTF8::uni_forward( const unsigned char* aSequence, unsigned* aResult )
69{
70 unsigned ch = *aSequence;
71
72 if( ch < 0x80 )
73 {
74 if( aResult )
75 *aResult = ch;
76 return 1;
77 }
78
79 const unsigned char* s = aSequence;
80
81 static const unsigned char utf8_len[] = {
82 // Map encoded prefix byte to sequence length. Zero means
83 // illegal prefix. See RFC 3629 for details
84 /*
85 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F
86 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
87 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
88 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
89 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
90 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
91 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F
93 */
94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F
95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
98 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF
99 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
100 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
101 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F0-F4 + F5-FF
102 };
103
104 int len = utf8_len[ *s - 0x80 /* top half of table is missing */ ];
105
106 switch( len )
107 {
108 default:
109 case 0:
110 if( aResult )
111 wxFAIL_MSG( wxS( "uni_forward: invalid start byte" ) );
112
113 return 0;
114 break;
115
116 case 2:
117 if( ( s[1] & 0xc0 ) != 0x80 )
118 {
119 if( aResult )
120 wxFAIL_MSG( wxS( "uni_forward: invalid continuation byte" ) );
121
122 return 0;
123 }
124
125 ch = ((s[0] & 0x1f) << 6) +
126 ((s[1] & 0x3f) << 0);
127
128 // assert( ch > 0x007F && ch <= 0x07FF );
129 break;
130
131 case 3:
132 if( (s[1] & 0xc0) != 0x80 ||
133 (s[2] & 0xc0) != 0x80 ||
134 (s[0] == 0xE0 && s[1] < 0xA0)
135 // || (s[0] == 0xED && s[1] > 0x9F)
136 )
137 {
138 if( aResult )
139 wxFAIL_MSG( wxS( "uni_forward: invalid continuation byte" ) );
140
141 return 0;
142 }
143
144 ch = ((s[0] & 0x0f) << 12) +
145 ((s[1] & 0x3f) << 6 ) +
146 ((s[2] & 0x3f) << 0 );
147
148 // assert( ch > 0x07FF && ch <= 0xFFFF );
149 break;
150
151 case 4:
152 if( (s[1] & 0xc0) != 0x80 ||
153 (s[2] & 0xc0) != 0x80 ||
154 (s[3] & 0xc0) != 0x80 ||
155 (s[0] == 0xF0 && s[1] < 0x90) ||
156 (s[0] == 0xF4 && s[1] > 0x8F) )
157 {
158 if( aResult )
159 wxFAIL_MSG( wxS( "uni_forward: invalid continuation byte" ) );
160
161 return 0;
162 }
163
164 ch = ((s[0] & 0x7) << 18) +
165 ((s[1] & 0x3f) << 12) +
166 ((s[2] & 0x3f) << 6 ) +
167 ((s[3] & 0x3f) << 0 );
168
169 // assert( ch > 0xFFFF && ch <= 0x10ffff );
170 break;
171 }
172
173 if( aResult )
174 *aResult = ch;
175
176 return len;
177}
178
179
180bool IsUTF8( const char* aString )
181{
182 int len = strlen( aString );
183
184 if( len )
185 {
186 const unsigned char* next = (unsigned char*) aString;
187 const unsigned char* end = next + len;
188
189 while( next < end )
190 {
191 int charLen = UTF8::uni_forward( next, nullptr );
192
193 if( charLen == 0 )
194 return false;
195
196 next += charLen;
197 }
198
199 // uni_forward() should find the exact end if it is truly UTF8
200 if( next > end )
201 return false;
202 }
203
204 return true;
205}
206
207
208UTF8::UTF8( const wchar_t* txt )
209{
210 try
211 {
212 std::vector< char > temp( wcslen( txt ) * 4 + 1 );
213 wxConvUTF8.WC2MB( temp.data(), txt, temp.size() );
214 m_s.assign( temp.data() );
215 }
216 catch(...)
217 {
218 auto string = wxSafeConvertWX2MB( txt );
219 m_s.assign( string );
220 }
221
222 m_s.shrink_to_fit();
223}
224
225
226UTF8& UTF8::operator+=( unsigned w_ch )
227{
228 if( w_ch <= 0x7F )
229 {
230 m_s.operator+=( char( w_ch ) );
231 }
232 else
233 {
234 //TODO: Remove wchar use. Replace with std::byte*
235 wchar_t wide_chr[2]; // buffer to store wide chars (UTF16) read from aText
236 wide_chr[1] = 0;
237 wide_chr[0] = w_ch;
238 UTF8 substr( wide_chr );
239 m_s += substr.m_s;
240 }
241
242 return *this;
243}
An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion support to and fro...
Definition: utf8.h:71
UTF8 & operator+=(const UTF8 &str)
Definition: utf8.h:123
std::string m_s
Definition: utf8.h:304
std::string substr(size_t pos=0, size_t len=npos) const
Definition: utf8.h:178
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=nullptr)
Advance over a single UTF8 encoded multibyte character, capturing the Unicode character as it goes,...
Definition: utf8.cpp:68
const char * c_str() const
Definition: utf8.h:102
wxString wx_str() const
Definition: utf8.cpp:46
UTF8 & operator=(const wxString &o)
Definition: utf8.cpp:58
UTF8()
Definition: utf8.h:93
CITER next(CITER it)
Definition: ptree.cpp:126
bool IsUTF8(const char *aString)
Test a C string to see if it is UTF8 encoded.
Definition: utf8.cpp:180