|
@@ -5,390 +5,316 @@
|
5
|
5
|
*/
|
6
|
6
|
package org.openzen.zenscript.shared;
|
7
|
7
|
|
|
8
|
+import java.io.IOException;
|
|
9
|
+import java.io.InputStream;
|
|
10
|
+import java.util.Collection;
|
|
11
|
+import java.util.HashMap;
|
|
12
|
+import java.util.Map;
|
|
13
|
+import java.util.Properties;
|
|
14
|
+import java.util.logging.Level;
|
|
15
|
+import java.util.logging.Logger;
|
|
16
|
+import java.util.regex.Pattern;
|
|
17
|
+
|
8
|
18
|
/**
 * Static string helpers: padding, escaping and unescaping of quoted string
 * literals, and parsing of character entities.
 *
 * Named character entities are loaded once, at class initialization, from the
 * <code>characterEntities.properties</code> classpath resource. If that
 * resource is missing or unreadable the class still works, but only numeric
 * character entities are recognized.
 *
 * @author Hoofdgebruiker
 */
public class StringUtils {
	private static final String CHARACTER_ENTITIES_RESOURCE = "/org/openzen/zenscript/shared/characterEntities.properties";
	private static final Logger LOGGER = Logger.getLogger(StringUtils.class.getName());

	private static final Map<String, CharacterEntity> NAMED_CHARACTER_ENTITIES;
	private static final Pattern MATCH_ACCENTS = Pattern.compile("\\p{M}");

	static
	{
		NAMED_CHARACTER_ENTITIES = new HashMap<>();

		Properties properties = new Properties();
		// Resolve the resource through this class (not java.lang.String), so the
		// lookup uses the class loader / module that actually ships the file.
		// try-with-resources tolerates a null resource and closes the stream.
		try (InputStream input = StringUtils.class.getResourceAsStream(CHARACTER_ENTITIES_RESOURCE)) {
			if (input != null)
				properties.load(input);
			else
				LOGGER.log(Level.WARNING, "Could not load character entities");
		} catch (IOException ex) {
			LOGGER.log(Level.SEVERE, null, ex);
		}

		for (String key : properties.stringPropertyNames()) {
			String rawValue = properties.getProperty(key);
			try {
				char value = (char) Integer.parseInt(rawValue.trim());
				CharacterEntity entity = new CharacterEntity(key, value);
				NAMED_CHARACTER_ENTITIES.put(entity.stringValue, entity);
			} catch (NumberFormatException ex) {
				// A single malformed entry must not abort class initialization.
				LOGGER.log(Level.WARNING, "Skipping character entity with invalid code: " + key + "=" + rawValue, ex);
			}
		}
	}

	/**
	 * Left pads (prefixes) a string with characters until it reaches the given string
	 * length. Does not do anything if the string length &gt;= given length.
	 *
	 * @param value value to be padded
	 * @param length desired string length
	 * @param c padding character
	 * @return padded string
	 */
	public static String lpad(String value, int length, char c)
	{
		if (value.length() >= length)
			return value;

		return times(c, length - value.length()) + value;
	}

	/**
	 * Right pads (suffixes) a string with characters until it reaches the given
	 * string length. Does not do anything if the string length &gt;= given length.
	 *
	 * @param value value to be padded
	 * @param length desired string length
	 * @param c padding character
	 * @return padded string
	 */
	public static String rpad(String value, int length, char c)
	{
		if (value.length() >= length)
			return value;

		return value + times(c, length - value.length());
	}

	/**
	 * Constructs a string with count times the given character.
	 *
	 * @param c filling character
	 * @param count character count (must not be negative)
	 * @return string value
	 */
	public static String times(char c, int count)
	{
		char[] value = new char[count];
		for (int i = 0; i < count; i++) {
			value[i] = c;
		}
		return new String(value);
	}

	/**
	 * Unescapes a string escaped in one of following ways:
	 *
	 * <ul>
	 * <li>A string escaped with single quotes (<code>'Hello "my" world'</code>)</li>
	 * <li>A string escaped with double quotes (<code>"Hello 'my' world"</code>)</li>
	 * <li>A near-literal string (<code>@"C:\Program Files\"</code>) in which escape sequences
	 * aren't processed but the " character cannot occur</li>
	 * </ul>
	 *
	 * The following escape sequences are recognized:
	 * <ul>
	 * <li>\\</li>
	 * <li>\'</li>
	 * <li>\"</li>
	 * <li>\&amp;namedCharacterEntity; (note that although redundant, \&amp;#ddd; and \&amp;#xXXXX; are also allowed)</li>
	 * <li>\t</li>
	 * <li>\n</li>
	 * <li>\r</li>
	 * <li>\b</li>
	 * <li>\f</li>
	 * <li>a backslash, the letter u and exactly four hex digits, for a unicode character</li>
	 * </ul>
	 *
	 * @param escapedString escaped string, including its enclosing quotes
	 * @return unescaped string
	 * @throws IllegalArgumentException if the string is not properly quoted or
	 * contains an unknown or unfinished escape sequence
	 */
	public static String unescape(String escapedString)
	{
		if (escapedString.length() < 2)
			throw new IllegalArgumentException("String is not quoted");

		boolean isLiteral = escapedString.charAt(0) == '@';
		if (isLiteral)
			escapedString = escapedString.substring(1);

		char quoteCharacter = escapedString.charAt(0);
		if (quoteCharacter != '"' && quoteCharacter != '\'')
			throw new IllegalArgumentException("String is not quoted");

		// Index of the closing quote. The length check matters after stripping
		// '@': a lone quote would otherwise count as both opening and closing.
		int end = escapedString.length() - 1;
		if (end < 1 || escapedString.charAt(end) != quoteCharacter)
			throw new IllegalArgumentException("Unbalanced quotes");

		if (isLiteral)
			return escapedString.substring(1, end);

		StringBuilder result = new StringBuilder(end - 1);

		for (int i = 1; i < end; i++) {
			char current = escapedString.charAt(i);
			if (current != '\\') {
				result.append(current);
				continue;
			}

			// The escaped character must lie before the closing quote.
			if (i + 1 >= end)
				throw new IllegalArgumentException("Unfinished escape sequence");

			switch (escapedString.charAt(i + 1)) {
				case '\\': i++; result.append('\\'); break;
				case '&': {
					CharacterEntity characterEntity = readCharacterEntity(escapedString, i + 1);
					// Move to the terminating ';' - the loop increment steps past it.
					i += characterEntity.stringValue.length() + 2;
					result.append(characterEntity.charValue);
					break;
				}
				case 't': i++; result.append('\t'); break;
				case 'r': i++; result.append('\r'); break;
				case 'n': i++; result.append('\n'); break;
				case 'b': i++; result.append('\b'); break;
				case 'f': i++; result.append('\f'); break;
				case '"': i++; result.append('\"'); break;
				case '\'': i++; result.append('\''); break;
				case 'u': {
					// All four hex digits (i + 2 .. i + 5) must precede the closing quote.
					if (i + 5 >= end)
						throw new IllegalArgumentException("Unfinished escape sequence");

					int codeUnit = (readHexCharacter(escapedString.charAt(i + 2)) << 12)
							| (readHexCharacter(escapedString.charAt(i + 3)) << 8)
							| (readHexCharacter(escapedString.charAt(i + 4)) << 4)
							| readHexCharacter(escapedString.charAt(i + 5));
					i += 5;
					// Cast is required: append(int) would append the decimal number.
					result.append((char) codeUnit);
					break;
				}
				default:
					throw new IllegalArgumentException("Illegal escape sequence");
			}
		}

		return result.toString();
	}

	/**
	 * Escapes special characters in the given string and adds opening and
	 * closing quotes, such that {@link #unescape(String)} restores the
	 * original value.
	 *
	 * Backslashes, the given quote character, newline, carriage return, tab,
	 * backspace and form feed are escaped. The other quote character (for
	 * instance ' inside a "-quoted string) is emitted unchanged.
	 *
	 * @param value value to be escaped
	 * @param quote quote character to enclose the value with (' or ")
	 * @param escapeUnicode true to escape any non-ascii value, false to leave them be
	 * @return escaped value
	 */
	public static String escape(String value, char quote, boolean escapeUnicode)
	{
		StringBuilder output = new StringBuilder(value.length() + 2);
		output.append(quote);
		for (int i = 0; i < value.length(); i++) {
			char c = value.charAt(i);
			switch (c) {
				case '\\': output.append("\\\\"); break;
				case '"':
				case '\'':
					// Only the active quote needs a backslash; the other one is kept as-is.
					if (c == quote)
						output.append('\\');
					output.append(c);
					break;
				case '\n': output.append("\\n"); break;
				case '\r': output.append("\\r"); break;
				case '\t': output.append("\\t"); break;
				case '\b': output.append("\\b"); break;
				case '\f': output.append("\\f"); break;
				default:
					if (escapeUnicode && c > 127) {
						output.append("\\u");
						output.append(lpad(Integer.toHexString(c), 4, '0'));
					} else {
						output.append(c);
					}
			}
		}

		output.append(quote);
		return output.toString();
	}

	/**
	 * Reads a single hex digit and converts it to a value 0-15.
	 *
	 * @param hex hex digit
	 * @return converted value
	 * @throws IllegalArgumentException if the character is not a hex digit
	 */
	public static int readHexCharacter(char hex)
	{
		if (hex >= '0' && hex <= '9')
			return hex - '0';

		if (hex >= 'A' && hex <= 'F')
			return hex - 'A' + 10;

		if (hex >= 'a' && hex <= 'f')
			return hex - 'a' + 10;

		throw new IllegalArgumentException("Illegal hex character: " + hex);
	}

	/**
	 * Retrieves all named character entities that were loaded at class
	 * initialization.
	 *
	 * @return read-only view of the named character entities
	 */
	public static Collection<CharacterEntity> getNamedCharacterEntities()
	{
		return java.util.Collections.unmodifiableCollection(NAMED_CHARACTER_ENTITIES.values());
	}

	/**
	 * Reads a single character entity (formatted as &amp;characterEntity;) at the
	 * given string offset.
	 *
	 * The following formats are supported:
	 * <ul>
	 * <li>&amp;namedCharacterEntity;</li>
	 * <li>&amp;#ddd; (decimal character code)</li>
	 * <li>&amp;#xXXXX; (hexadecimal character code)</li>
	 * </ul>
	 *
	 * The string value of the returned entity is the entity text without the
	 * enclosing &amp; and ; characters.
	 *
	 * @param str string value to search in
	 * @param offset offset to look at; must point at the &amp; character
	 * @return character entity
	 * @throws IllegalArgumentException if the given string does not contain a
	 * valid character entity at the given position
	 */
	public static CharacterEntity readCharacterEntity(String str, int offset)
	{
		if (offset < 0 || offset + 3 >= str.length())
			throw new IllegalArgumentException("Not a proper character entity");
		if (str.charAt(offset) != '&')
			throw new IllegalArgumentException("Not a proper character entity");

		int semi = str.indexOf(';', offset);
		if (semi < 0)
			throw new IllegalArgumentException("Not a proper character entity");

		String entity = str.substring(offset + 1, semi);
		if (entity.isEmpty())
			throw new IllegalArgumentException("Not a proper character entity");

		CharacterEntity named = NAMED_CHARACTER_ENTITIES.get(entity);
		if (named != null)
			return named;

		if (entity.charAt(0) == '#') {
			if (entity.length() < 2)
				throw new IllegalArgumentException("Not a proper character entity");

			// The radix marker is part of the entity text, not of the surrounding string.
			char marker = entity.charAt(1);
			boolean isHex = marker == 'x' || marker == 'X';
			String digits = entity.substring(isHex ? 2 : 1);
			return new CharacterEntity(entity, parseCharacterCode(digits, isHex ? 16 : 10));
		}

		throw new IllegalArgumentException("Not a valid named character entity");
	}

	/**
	 * Parses an unsigned character code in the given radix, rejecting empty
	 * input, signs, non-digits and values that do not fit in a char.
	 */
	private static char parseCharacterCode(String digits, int radix)
	{
		if (digits.isEmpty())
			throw new IllegalArgumentException("Not a proper character entity");

		int code = 0;
		for (int i = 0; i < digits.length(); i++) {
			int digit = Character.digit(digits.charAt(i), radix);
			if (digit < 0)
				throw new IllegalArgumentException("Not a proper character entity");

			code = code * radix + digit;
			// Checking on every step also keeps the accumulator from overflowing.
			if (code > Character.MAX_VALUE)
				throw new IllegalArgumentException("Character entity out of range");
		}
		return (char) code;
	}

	/**
	 * A character entity: the entity text (without the enclosing &amp; and ;)
	 * together with the character it stands for.
	 *
	 * Instances of named entities are shared by all callers, so the fields
	 * should be treated as read-only.
	 */
	public static class CharacterEntity
	{
		public char charValue;
		public String stringValue;

		public CharacterEntity(String stringValue, char charValue)
		{
			this.charValue = charValue;
			this.stringValue = stringValue;
		}
	}
}
|