Lines 71-76
Link Here
|
71 |
|
71 |
|
72 |
import org.apache.xerces.impl.XMLErrorReporter; |
72 |
import org.apache.xerces.impl.XMLErrorReporter; |
73 |
import org.apache.xerces.impl.io.ASCIIReader; |
73 |
import org.apache.xerces.impl.io.ASCIIReader; |
|
|
74 |
import org.apache.xerces.impl.io.UCSReader; |
74 |
import org.apache.xerces.impl.io.UTF8Reader; |
75 |
import org.apache.xerces.impl.io.UTF8Reader; |
75 |
import org.apache.xerces.impl.msg.XMLMessageFormatter; |
76 |
import org.apache.xerces.impl.msg.XMLMessageFormatter; |
76 |
|
77 |
|
Lines 112-118
Link Here
|
112 |
* @author Andy Clark, IBM |
113 |
* @author Andy Clark, IBM |
113 |
* @author Arnaud Le Hors, IBM |
114 |
* @author Arnaud Le Hors, IBM |
114 |
* |
115 |
* |
115 |
* @version $Id: XMLEntityManager.java,v 1.14 2001/12/17 16:41:12 neilg Exp $ |
116 |
* @version $Id: XMLEntityManager.java,v 1.16 2002/01/04 23:27:49 neilg Exp $ |
116 |
*/ |
117 |
*/ |
117 |
public class XMLEntityManager |
118 |
public class XMLEntityManager |
118 |
implements XMLComponent, XMLEntityResolver { |
119 |
implements XMLComponent, XMLEntityResolver { |
Lines 703-708
Link Here
|
703 |
final String systemId = xmlInputSource.getSystemId(); |
704 |
final String systemId = xmlInputSource.getSystemId(); |
704 |
String baseSystemId = xmlInputSource.getBaseSystemId(); |
705 |
String baseSystemId = xmlInputSource.getBaseSystemId(); |
705 |
String encoding = xmlInputSource.getEncoding(); |
706 |
String encoding = xmlInputSource.getEncoding(); |
|
|
707 |
Boolean isBigEndian = null; |
706 |
|
708 |
|
707 |
// create reader |
709 |
// create reader |
708 |
InputStream stream = null; |
710 |
InputStream stream = null; |
Lines 728-734
Link Here
|
728 |
b4[count] = (byte)stream.read(); |
730 |
b4[count] = (byte)stream.read(); |
729 |
} |
731 |
} |
730 |
if (count == 4) { |
732 |
if (count == 4) { |
731 |
encoding = getEncodingName(b4, count); |
733 |
Object [] encodingDesc = getEncodingName(b4, count); |
|
|
734 |
encoding = (String)(encodingDesc[0]); |
735 |
isBigEndian = (Boolean)(encodingDesc[1]); |
732 |
|
736 |
|
733 |
// removed use of pushback inputstream--neilg |
737 |
// removed use of pushback inputstream--neilg |
734 |
/***** |
738 |
/***** |
Lines 765-782
Link Here
|
765 |
// indirection to get at the underlying bytes. -Ac |
769 |
// indirection to get at the underlying bytes. -Ac |
766 |
|
770 |
|
767 |
// create reader from input stream |
771 |
// create reader from input stream |
768 |
reader = createReader(new RewindableInputStream(pbstream), encoding); |
772 |
reader = createReader(new RewindableInputStream(pbstream), encoding, isBigEndian); |
769 |
******/ |
773 |
******/ |
770 |
reader = createReader(stream, encoding); |
774 |
reader = createReader(stream, encoding, isBigEndian); |
771 |
} |
775 |
} |
772 |
else { |
776 |
else { |
773 |
reader = createReader(stream, encoding); |
777 |
reader = createReader(stream, encoding, isBigEndian); |
774 |
} |
778 |
} |
775 |
} |
779 |
} |
776 |
|
780 |
|
777 |
// use specified encoding |
781 |
// use specified encoding |
778 |
else { |
782 |
else { |
779 |
reader = createReader(stream, encoding); |
783 |
reader = createReader(stream, encoding, isBigEndian); |
780 |
} |
784 |
} |
781 |
|
785 |
|
782 |
// read one character at a time so we don't jump too far |
786 |
// read one character at a time so we don't jump too far |
Lines 1136-1150
Link Here
|
1136 |
|
1140 |
|
1137 |
/** |
1141 |
/** |
1138 |
* Returns the IANA encoding name that is auto-detected from |
1142 |
* Returns the IANA encoding name that is auto-detected from |
1139 |
* the bytes specified. |
1143 |
* the bytes specified, with the endian-ness of that encoding where appropriate. |
1140 |
* |
1144 |
* |
1141 |
* @param b4 The first four bytes of the input. |
1145 |
* @param b4 The first four bytes of the input. |
1142 |
* @param count The number of bytes actually read. |
1146 |
* @param count The number of bytes actually read. |
|
|
1147 |
* @return a 2-element array: the first element, an IANA-encoding string, |
1148 |
* the second element a Boolean which is true iff the document is big endian, false |
1149 |
* if it's little-endian, and null if the distinction isn't relevant. |
1143 |
*/ |
1150 |
*/ |
1144 |
protected String getEncodingName(byte[] b4, int count) { |
1151 |
protected Object[] getEncodingName(byte[] b4, int count) { |
1145 |
|
1152 |
|
1146 |
if (count < 2) { |
1153 |
if (count < 2) { |
1147 |
return "UTF-8"; |
1154 |
return new Object[]{"UTF-8", null}; |
1148 |
} |
1155 |
} |
1149 |
|
1156 |
|
1150 |
// UTF-16, with BOM |
1157 |
// UTF-16, with BOM |
Lines 1152-1223
Link Here
|
1152 |
int b1 = b4[1] & 0xFF; |
1159 |
int b1 = b4[1] & 0xFF; |
1153 |
if (b0 == 0xFE && b1 == 0xFF) { |
1160 |
if (b0 == 0xFE && b1 == 0xFF) { |
1154 |
// UTF-16, big-endian |
1161 |
// UTF-16, big-endian |
1155 |
return "UTF-16"; |
1162 |
return new Object [] {"UTF-16BE", new Boolean(true)}; |
1156 |
} |
1163 |
} |
1157 |
if (b0 == 0xFF && b1 == 0xFE) { |
1164 |
if (b0 == 0xFF && b1 == 0xFE) { |
1158 |
// UTF-16, little-endian |
1165 |
// UTF-16, little-endian |
1159 |
return "UTF-16"; |
1166 |
return new Object [] {"UTF-16LE", new Boolean(false)}; |
1160 |
} |
1167 |
} |
1161 |
|
1168 |
|
1162 |
// default to UTF-8 if we don't have enough bytes to make a |
1169 |
// default to UTF-8 if we don't have enough bytes to make a |
1163 |
// good determination of the encoding |
1170 |
// good determination of the encoding |
1164 |
if (count < 3) { |
1171 |
if (count < 3) { |
1165 |
return "UTF-8"; |
1172 |
return new Object [] {"UTF-8", null}; |
1166 |
} |
1173 |
} |
1167 |
|
1174 |
|
1168 |
// UTF-8 with a BOM |
1175 |
// UTF-8 with a BOM |
1169 |
int b2 = b4[2] & 0xFF; |
1176 |
int b2 = b4[2] & 0xFF; |
1170 |
if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { |
1177 |
if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { |
1171 |
return "UTF-8"; |
1178 |
return new Object [] {"UTF-8", null}; |
1172 |
} |
1179 |
} |
1173 |
|
1180 |
|
1174 |
// default to UTF-8 if we don't have enough bytes to make a |
1181 |
// default to UTF-8 if we don't have enough bytes to make a |
1175 |
// good determination of the encoding |
1182 |
// good determination of the encoding |
1176 |
if (count < 4) { |
1183 |
if (count < 4) { |
1177 |
return "UTF-8"; |
1184 |
return new Object [] {"UTF-8", null}; |
1178 |
} |
1185 |
} |
1179 |
|
1186 |
|
1180 |
// other encodings |
1187 |
// other encodings |
1181 |
int b3 = b4[3] & 0xFF; |
1188 |
int b3 = b4[3] & 0xFF; |
1182 |
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { |
1189 |
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { |
1183 |
// UCS-4, big endian (1234) |
1190 |
// UCS-4, big endian (1234) |
1184 |
// REVISIT: What should this be? |
1191 |
return new Object [] {"ISO-10646-UCS-4", new Boolean(true)}; |
1185 |
return "UnicodeBig"; |
|
|
1186 |
} |
1192 |
} |
1187 |
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { |
1193 |
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { |
1188 |
// UCS-4, little endian (4321) |
1194 |
// UCS-4, little endian (4321) |
1189 |
// REVISIT: What should this be? |
1195 |
return new Object [] {"ISO-10646-UCS-4", new Boolean(false)}; |
1190 |
return "UnicodeLittleUnmarked"; |
|
|
1191 |
} |
1196 |
} |
1192 |
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { |
1197 |
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { |
1193 |
// UCS-4, unusual octet order (2143) |
1198 |
// UCS-4, unusual octet order (2143) |
1194 |
// REVISIT: What should this be? |
1199 |
// REVISIT: What should this be? |
1195 |
return "UnicodeBigUnmarked"; |
1200 |
return new Object [] {"ISO-10646-UCS-4", null}; |
1196 |
} |
1201 |
} |
1197 |
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { |
1202 |
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { |
1198 |
// UCS-4, unusual octet order (3412) |
1203 |
// UCS-4, unusual octet order (3412) |
1199 |
// REVISIT: What should this be? |
1204 |
// REVISIT: What should this be? |
1200 |
return "UnicodeLittleUnmarked"; |
1205 |
return new Object [] {"ISO-10646-UCS-4", null}; |
1201 |
} |
1206 |
} |
1202 |
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { |
1207 |
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { |
1203 |
// UTF-16, big-endian, no BOM |
1208 |
// UTF-16, big-endian, no BOM |
|
|
1209 |
// (or could turn out to be UCS-2...) |
1204 |
// REVISIT: What should this be? |
1210 |
// REVISIT: What should this be? |
1205 |
return "UnicodeBig"; |
1211 |
return new Object [] {"UTF-16BE", new Boolean(true)}; |
1206 |
} |
1212 |
} |
1207 |
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { |
1213 |
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { |
1208 |
// UTF-16, little-endian, no BOM |
1214 |
// UTF-16, little-endian, no BOM |
1209 |
return "UnicodeLittle"; |
1215 |
// (or could turn out to be UCS-2...) |
|
|
1216 |
return new Object [] {"UTF-16LE", new Boolean(false)}; |
1210 |
} |
1217 |
} |
1211 |
if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { |
1218 |
if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { |
1212 |
// EBCDIC |
1219 |
// EBCDIC |
1213 |
// a la xerces1, return CP037 instead of EBCDIC here |
1220 |
// a la xerces1, return CP037 instead of EBCDIC here |
1214 |
return "CP037"; |
1221 |
return new Object [] {"CP037", null}; |
1215 |
} |
1222 |
} |
1216 |
|
1223 |
|
1217 |
// default encoding |
1224 |
// default encoding |
1218 |
return "UTF-8"; |
1225 |
return new Object [] {"UTF-8", null}; |
1219 |
|
1226 |
|
1220 |
} // getEncodingName(byte[],int):String |
1227 |
} // getEncodingName(byte[],int):Object[] |
1221 |
|
1228 |
|
1222 |
/** |
1229 |
/** |
1223 |
* Creates a reader capable of reading the given input stream in |
1230 |
* Creates a reader capable of reading the given input stream in |
Lines 1229-1238
Link Here
|
1229 |
* Java encoding names are allowed, then the |
1236 |
* Java encoding names are allowed, then the |
1230 |
* encoding name may be a Java encoding name; |
1237 |
* encoding name may be a Java encoding name; |
1231 |
* otherwise, it is an ianaEncoding name. |
1238 |
* otherwise, it is an ianaEncoding name. |
|
|
1239 |
* @param isBigEndian For encodings (like UCS-4), whose names cannot |
1240 |
* specify a byte order, this tells whether the order is big-endian. null means |
1241 |
* unknown or not relevant. |
1232 |
* |
1242 |
* |
1233 |
* @return Returns a reader. |
1243 |
* @return Returns a reader. |
1234 |
*/ |
1244 |
*/ |
1235 |
protected Reader createReader(InputStream inputStream, String encoding) |
1245 |
protected Reader createReader(InputStream inputStream, String encoding, Boolean isBigEndian) |
1236 |
throws IOException { |
1246 |
throws IOException { |
1237 |
|
1247 |
|
1238 |
// normalize encoding name |
1248 |
// normalize encoding name |
Lines 1254-1259
Link Here
|
1254 |
} |
1264 |
} |
1255 |
return new ASCIIReader(inputStream, fBufferSize); |
1265 |
return new ASCIIReader(inputStream, fBufferSize); |
1256 |
} |
1266 |
} |
|
|
1267 |
if(ENCODING.equals("ISO-10646-UCS-4")) { |
1268 |
if(isBigEndian != null) { |
1269 |
boolean isBE = isBigEndian.booleanValue(); |
1270 |
if(isBE) { |
1271 |
return new UCSReader(inputStream, UCSReader.UCS4BE); |
1272 |
} else { |
1273 |
return new UCSReader(inputStream, UCSReader.UCS4LE); |
1274 |
} |
1275 |
} else { |
1276 |
fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, |
1277 |
"EncodingByteOrderUnsupported", |
1278 |
new Object[] { encoding }, |
1279 |
XMLErrorReporter.SEVERITY_FATAL_ERROR); |
1280 |
} |
1281 |
} |
1282 |
if(ENCODING.equals("ISO-10646-UCS-2")) { |
1283 |
if(isBigEndian != null) { // should never happen with this encoding... |
1284 |
boolean isBE = isBigEndian.booleanValue(); |
1285 |
if(isBE) { |
1286 |
return new UCSReader(inputStream, UCSReader.UCS2BE); |
1287 |
} else { |
1288 |
return new UCSReader(inputStream, UCSReader.UCS2LE); |
1289 |
} |
1290 |
} else { |
1291 |
fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, |
1292 |
"EncodingByteOrderUnsupported", |
1293 |
new Object[] { encoding }, |
1294 |
XMLErrorReporter.SEVERITY_FATAL_ERROR); |
1295 |
} |
1296 |
} |
1257 |
|
1297 |
|
1258 |
// check for valid name |
1298 |
// check for valid name |
1259 |
boolean validIANA = XMLChar.isValidIANAEncoding(encoding); |
1299 |
boolean validIANA = XMLChar.isValidIANAEncoding(encoding); |
Lines 1277-1283
Link Here
|
1277 |
// try to use a Java reader |
1317 |
// try to use a Java reader |
1278 |
String javaEncoding = EncodingMap.getIANA2JavaMapping(ENCODING); |
1318 |
String javaEncoding = EncodingMap.getIANA2JavaMapping(ENCODING); |
1279 |
if (javaEncoding == null) { |
1319 |
if (javaEncoding == null) { |
|
|
1320 |
if(fAllowJavaEncodings) { |
1280 |
javaEncoding = encoding; |
1321 |
javaEncoding = encoding; |
|
|
1322 |
} else { |
1323 |
fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN, |
1324 |
"EncodingDeclInvalid", |
1325 |
new Object[] { encoding }, |
1326 |
XMLErrorReporter.SEVERITY_FATAL_ERROR); |
1327 |
// see comment above. |
1328 |
javaEncoding = "ISO8859_1"; |
1329 |
} |
1281 |
} |
1330 |
} |
1282 |
if (DEBUG_ENCODINGS) { |
1331 |
if (DEBUG_ENCODINGS) { |
1283 |
System.out.print("$$$ creating Java InputStreamReader: encoding="+javaEncoding); |
1332 |
System.out.print("$$$ creating Java InputStreamReader: encoding="+javaEncoding); |
Lines 1288-1294
Link Here
|
1288 |
} |
1337 |
} |
1289 |
return new InputStreamReader(inputStream, javaEncoding); |
1338 |
return new InputStreamReader(inputStream, javaEncoding); |
1290 |
|
1339 |
|
1291 |
} // createReader(InputStream,String): |
1340 |
} // createReader(InputStream,String, Boolean): Reader |
1292 |
|
1341 |
|
1293 |
// |
1342 |
// |
1294 |
// Protected static methods |
1343 |
// Protected static methods |
Lines 1772-1777
Link Here
|
1772 |
// a single char! -Ac |
1821 |
// a single char! -Ac |
1773 |
if (fCurrentEntity.encoding == null || |
1822 |
if (fCurrentEntity.encoding == null || |
1774 |
!fCurrentEntity.encoding.equals(encoding)) { |
1823 |
!fCurrentEntity.encoding.equals(encoding)) { |
|
|
1824 |
// UTF-16 is a bit of a special case. If the encoding is UTF-16, |
1825 |
// and we know the endian-ness, we shouldn't change readers. |
1826 |
// If it's ISO-10646-UCS-(2|4), then we'll have to deduce |
1827 |
// the endian-ness from the encoding we presently have. |
1828 |
if(fCurrentEntity.encoding != null && fCurrentEntity.encoding.startsWith("UTF-16")) { |
1829 |
String ENCODING = encoding.toUpperCase(); |
1830 |
if(ENCODING.equals("UTF-16")) return; |
1831 |
if(ENCODING.equals("ISO-10646-UCS-4")) { |
1832 |
if(fCurrentEntity.encoding.equals("UTF-16BE")) { |
1833 |
fCurrentEntity.reader = new UCSReader(fCurrentEntity.stream, UCSReader.UCS4BE); |
1834 |
} else { |
1835 |
fCurrentEntity.reader = new UCSReader(fCurrentEntity.stream, UCSReader.UCS4LE); |
1836 |
} |
1837 |
return; |
1838 |
} |
1839 |
if(ENCODING.equals("ISO-10646-UCS-2")) { |
1840 |
if(fCurrentEntity.encoding.equals("UTF-16BE")) { |
1841 |
fCurrentEntity.reader = new UCSReader(fCurrentEntity.stream, UCSReader.UCS2BE); |
1842 |
} else { |
1843 |
fCurrentEntity.reader = new UCSReader(fCurrentEntity.stream, UCSReader.UCS2LE); |
1844 |
} |
1845 |
return; |
1846 |
} |
1847 |
} |
1775 |
// wrap a new reader around the input stream, changing |
1848 |
// wrap a new reader around the input stream, changing |
1776 |
// the encoding |
1849 |
// the encoding |
1777 |
if (DEBUG_ENCODINGS) { |
1850 |
if (DEBUG_ENCODINGS) { |
Lines 1779-1785
Link Here
|
1779 |
fCurrentEntity.stream); |
1852 |
fCurrentEntity.stream); |
1780 |
} |
1853 |
} |
1781 |
//fCurrentEntity.stream.reset(); |
1854 |
//fCurrentEntity.stream.reset(); |
1782 |
fCurrentEntity.reader = createReader(fCurrentEntity.stream, encoding); |
1855 |
fCurrentEntity.reader = createReader(fCurrentEntity.stream, encoding, null); |
1783 |
} else { |
1856 |
} else { |
1784 |
if (DEBUG_ENCODINGS) |
1857 |
if (DEBUG_ENCODINGS) |
1785 |
System.out.println("$$$ reusing old reader on stream"); |
1858 |
System.out.println("$$$ reusing old reader on stream"); |
Lines 3146-3155
Link Here
|
3146 |
} |
3219 |
} |
3147 |
|
3220 |
|
3148 |
public int read() throws IOException { |
3221 |
public int read() throws IOException { |
3149 |
int b; |
3222 |
int b = 0; |
3150 |
if (fOffset < fLength) { |
3223 |
if (fOffset < fLength) { |
3151 |
System.err.println("REturned buffered: " + (char)(fData[fOffset++] & 0xFF)); |
3224 |
return fData[fOffset++] & 0xff; |
3152 |
return fData[fOffset++] & 0xFF; |
|
|
3153 |
} |
3225 |
} |
3154 |
if (fOffset == fEndOffset) { |
3226 |
if (fOffset == fEndOffset) { |
3155 |
return -1; |
3227 |
return -1; |
Lines 3166-3172
Link Here
|
3166 |
} |
3238 |
} |
3167 |
fData[fLength++] = (byte)b; |
3239 |
fData[fLength++] = (byte)b; |
3168 |
fOffset++; |
3240 |
fOffset++; |
3169 |
return b; |
3241 |
return b & 0xff; |
3170 |
} |
3242 |
} |
3171 |
|
3243 |
|
3172 |
public int read(byte[] b, int off, int len) throws IOException { |
3244 |
public int read(byte[] b, int off, int len) throws IOException { |
Lines 3179-3191
Link Here
|
3179 |
if(fCurrentEntity.mayReadChunks) { |
3251 |
if(fCurrentEntity.mayReadChunks) { |
3180 |
return fInputStream.read(b, off, len); |
3252 |
return fInputStream.read(b, off, len); |
3181 |
} |
3253 |
} |
3182 |
b[off] = (byte)read(); |
3254 |
int returnedVal = read(); |
3183 |
if(b[off] == -1) { |
3255 |
if(returnedVal == -1) { |
3184 |
fEndOffset = fOffset; |
3256 |
fEndOffset = fOffset; |
3185 |
return -1; |
3257 |
return -1; |
3186 |
} |
3258 |
} |
3187 |
byte [] c = new byte[off]; |
3259 |
b[off] = (byte)returnedVal; |
3188 |
System.arraycopy(b,0,c,0,off); |
|
|
3189 |
return 1; |
3260 |
return 1; |
3190 |
} |
3261 |
} |
3191 |
if (len < bytesLeft) { |
3262 |
if (len < bytesLeft) { |