001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import java.io.BufferedInputStream;
020import java.io.BufferedReader;
021import java.io.File;
022import java.io.IOException;
023import java.io.InputStream;
024import java.io.InputStreamReader;
025import java.io.Reader;
026import java.io.StringReader;
027import java.net.HttpURLConnection;
028import java.net.URL;
029import java.net.URLConnection;
030import java.nio.charset.Charset;
031import java.nio.charset.StandardCharsets;
032import java.nio.file.Files;
033import java.nio.file.Path;
034import java.text.MessageFormat;
035import java.util.Locale;
036import java.util.Objects;
037import java.util.regex.Matcher;
038import java.util.regex.Pattern;
039
040import org.apache.commons.io.ByteOrderMark;
041import org.apache.commons.io.Charsets;
042import org.apache.commons.io.IOUtils;
043import org.apache.commons.io.build.AbstractStreamBuilder;
044import org.apache.commons.io.function.IOConsumer;
045
046/**
047 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
048 * <p>
049 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
050 * </p>
051 * <p>
 * All this has to be done without consuming characters from the stream; otherwise the XML parser will not recognize the document as valid XML. This is not
 * 100% true, but it's close enough (a UTF-8 BOM is not handled by all parsers right now; XmlStreamReader handles it and things work in all parsers).
054 * </p>
055 * <p>
056 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
057 * </p>
058 * <p>
 * By default the charset encoding detection is lenient; the lenient flag can be switched off to request strict detection (following the HTTP MIME and XML
 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="https://diveintomark.org/archives/2004/02/13/xml-media-types">
 * Determining the character encoding of a feed</a>.
062 * </p>
063 * <p>
064 * To build an instance, use {@link Builder}.
065 * </p>
066 * <p>
067 * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under Apache License 2.0.
068 * </p>
069 *
070 * @see Builder
071 * @see org.apache.commons.io.output.XmlStreamWriter
072 * @since 2.0
073 */
074public class XmlStreamReader extends Reader {
075
076    // @formatter:off
077    /**
078     * Builds a new {@link XmlStreamReader}.
079     *
080     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
081     * <p>
082     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
083     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
084     * </p>
085     * <p>
086     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
087     * </p>
088     * <p>
089     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
090     * </p>
091     * <p>
092     * Else if the XML prolog had a charset encoding that encoding is used.
093     * </p>
094     * <p>
095     * Else if the content type had a charset encoding that encoding is used.
096     * </p>
097     * <p>
098     * Else 'UTF-8' is used.
099     * </p>
100     * <p>
101     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
102     * </p>
103     * <p>
104     * For example:
105     * </p>
106     *
107     * <pre>{@code
108     * XmlStreamReader r = XmlStreamReader.builder()
109     *   .setPath(path)
110     *   .setCharset(StandardCharsets.UTF_8)
111     *   .get();
112     * }
113     * </pre>
114     *
115     * @see #get()
116     * @since 2.12.0
117     */
118    // @formatter:on
119    public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
120
121        private boolean nullCharset = true;
122        private boolean lenient = true;
123        private String httpContentType;
124
125        /**
126         * Constructs a new builder of {@link XmlStreamReader}.
127         */
128        public Builder() {
129            // empty
130        }
131
132        /**
133         * Builds a new {@link XmlStreamReader}.
134         * <p>
135         * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
136         * </p>
137         * <p>
138         * This builder uses the following aspects:
139         * </p>
140         * <ul>
141         * <li>{@link #getInputStream()}</li>
142         * <li>{@link #getCharset()}</li>
143         * <li>lenient</li>
144         * <li>httpContentType</li>
145         * </ul>
146         *
147         * @return a new instance.
148         * @throws IllegalStateException         if the {@code origin} is {@code null}.
149         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
150         * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
151         * @throws XmlStreamReaderException thrown if the Charset encoding could not be determined according to the specification.
152         * @see #getInputStream()
153         * @see #getUnchecked()
154         */
155        @Override
156        public XmlStreamReader get() throws IOException {
157            final String defaultEncoding = nullCharset ? null : getCharset().name();
158            // @formatter:off
159            return httpContentType == null
160                    ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
161                    : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
162            // @formatter:on
163        }
164
165        @Override
166        public Builder setCharset(final Charset charset) {
167            nullCharset = charset == null;
168            return super.setCharset(charset);
169        }
170
171        @Override
172        public Builder setCharset(final String charset) {
173            nullCharset = charset == null;
174            return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
175        }
176
177        /**
178         * Sets the HTTP content type.
179         *
180         * @param httpContentType the HTTP content type.
181         * @return {@code this} instance.
182         */
183        public Builder setHttpContentType(final String httpContentType) {
184            this.httpContentType = httpContentType;
185            return this;
186        }
187
188        /**
189         * Sets the lenient toggle.
190         *
191         * @param lenient the lenient toggle.
192         * @return {@code this} instance.
193         */
194        public Builder setLenient(final boolean lenient) {
195            this.lenient = lenient;
196            return this;
197        }
198
199    }
200
    /** Name of the UTF-8 charset. */
    private static final String UTF_8 = StandardCharsets.UTF_8.name();

    /** Name of the US-ASCII charset. */
    private static final String US_ASCII = StandardCharsets.US_ASCII.name();

    /** Name of the big-endian UTF-16 charset. */
    private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();

    /** Name of the little-endian UTF-16 charset. */
    private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();

    /** Name of the big-endian UTF-32 charset. */
    private static final String UTF_32BE = "UTF-32BE";

    /** Name of the little-endian UTF-32 charset. */
    private static final String UTF_32LE = "UTF-32LE";

    /** Name of the UTF-16 charset without an explicit byte order. */
    private static final String UTF_16 = StandardCharsets.UTF_16.name();

    /** Name of the UTF-32 charset without an explicit byte order. */
    private static final String UTF_32 = "UTF-32";

    /** Charset name used for the EBCDIC guess signature (code page 1047). */
    private static final String EBCDIC = "CP1047";

    /**
     * Byte order marks looked for at the start of the stream. Note that the UTF-16LE and UTF-32LE BOMs start with the same two bytes.
     */
    private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_32LE };

    /**
     * Leading byte signatures used to guess the encoding from the start of the XML declaration itself: the bytes of {@code <?xm} in UTF-8, UTF-32BE,
     * UTF-32LE and EBCDIC, and of {@code <?} in UTF-16BE and UTF-16LE. They are modelled as {@link ByteOrderMark}s so a BOMInputStream can match them.
     */
    private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
            new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
            new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
            new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
            new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };
228
229    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
230
231    /**
232     * Pattern capturing the encoding of the <a href="https://www.w3.org/TR/REC-xml/#sec-pi">{@code 'xml'} processing instruction</a>.
233     * <p>
234     * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">NT-EncName</a> XML specification.
235     * </p>
236     * <p>
237     * Note the documented pattern is:
238     * </p>
239     * <pre>
240     * EncName   ::=   [A-Za-z] ([A-Za-z0-9._] | '-')*
241     * </pre>
242     * <p>
243     * However this does not match all the aliases that are supported by Java. For example, {@code '437'}, {@code 'ISO_8859-1:1987'} and
244     * {@code 'ebcdic-de-273+euro'}.
245     * </p>
246     */
247    public static final Pattern ENCODING_PATTERN = Pattern.compile(
248    // @formatter:off
249            "^<\\?xml\\s+"
250            + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
251            + "encoding\\s*=\\s*"
252            + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"  // double-quoted
253            +  "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
254            Pattern.MULTILINE);
255    // @formatter:on
256
    // Exception message templates with positional placeholders. They are presumably filled in via MessageFormat; the call sites are outside this view.

    /** Raw stream: the BOM, XML guess and XML prolog encodings do not agree. */
    private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    /** Raw stream: the BOM encoding is not a known one. */
    private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    /** HTTP stream: a BOM is present where none is allowed. */
    private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";

    /** HTTP stream: the content-type encoding and the stream encodings do not agree. */
    private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    /** HTTP stream: the content-type MIME type is not acceptable. */
    private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
266
    /**
     * Constructs a new {@link Builder}.
     * <p>
     * This is the entry point for creating {@link XmlStreamReader} instances; configure the returned builder and call {@link Builder#get()}.
     * </p>
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }
276
277    /**
278     * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
279     *
280     * @param httpContentType the HTTP content type.
281     * @return The content type encoding (upcased).
282     */
283    static String getContentTypeEncoding(final String httpContentType) {
284        String encoding = null;
285        if (httpContentType != null) {
286            final int i = httpContentType.indexOf(";");
287            if (i > -1) {
288                final String postMime = httpContentType.substring(i + 1);
289                final Matcher m = CHARSET_PATTERN.matcher(postMime);
290                encoding = m.find() ? m.group(1) : null;
291                encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
292            }
293        }
294        return encoding;
295    }
296
297    /**
298     * Gets the MIME type or {@code null} if httpContentType is {@code null}.
299     *
300     * @param httpContentType the HTTP content type.
301     * @return The mime content type.
302     */
303    static String getContentTypeMime(final String httpContentType) {
304        String mime = null;
305        if (httpContentType != null) {
306            final int i = httpContentType.indexOf(";");
307            mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
308            mime = mime.trim();
309        }
310        return mime;
311    }
312
313    /**
314     * Gets the encoding declared in the <?xml encoding=...?>, {@code null} if none.
315     *
316     * @param inputStream InputStream to create the reader from.
317     * @param guessedEnc  guessed encoding.
318     * @return the encoding declared in the <?xml encoding=...?>.
319     * @throws IOException thrown if there is a problem reading the stream.
320     */
321    private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
322        String encoding = null;
323        if (guessedEnc != null) {
324            final byte[] bytes = IOUtils.byteArray();
325            inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
326            int offset = 0;
327            int max = IOUtils.DEFAULT_BUFFER_SIZE;
328            int c = inputStream.read(bytes, offset, max);
329            int firstGT = -1;
330            String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
331            while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
332                offset += c;
333                max -= c;
334                c = inputStream.read(bytes, offset, max);
335                xmlProlog = new String(bytes, 0, offset, guessedEnc);
336                firstGT = xmlProlog.indexOf('>');
337            }
338            if (firstGT == -1) {
339                if (c == -1) {
340                    throw new IOException("Unexpected end of XML stream");
341                }
342                throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
343            }
344            final int bytesRead = offset;
345            if (bytesRead > 0) {
346                inputStream.reset();
347                final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
348                final StringBuilder prolog = new StringBuilder();
349                IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
350                final Matcher m = ENCODING_PATTERN.matcher(prolog);
351                if (m.find()) {
352                    encoding = m.group(1).toUpperCase(Locale.ROOT);
353                    encoding = encoding.substring(1, encoding.length() - 1);
354                }
355            }
356        }
357        return encoding;
358    }
359
360    /**
361     * Tests if the MIME type belongs to the APPLICATION XML family.
362     *
363     * @param mime The mime type.
364     * @return true if the mime type belongs to the APPLICATION XML family, otherwise false.
365     */
366    static boolean isAppXml(final String mime) {
367        return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
368                || mime.startsWith("application/") && mime.endsWith("+xml"));
369    }
370
371    /**
372     * Tests if the MIME type belongs to the TEXT XML family.
373     *
374     * @param mime The mime type.
375     * @return true if the mime type belongs to the TEXT XML family, otherwise false.
376     */
377    static boolean isTextXml(final String mime) {
378        return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
379    }
380
    /** The reader that decodes the underlying stream with the detected encoding; {@code read} and {@code close} delegate to it. */
    private final Reader reader;

    /** The charset encoding determined at construction time; exposed by {@code getEncoding()}. */
    private final String encoding;

    /** The caller-supplied default encoding, possibly {@code null}; exposed by {@code getDefaultEncoding()}. */
    private final String defaultEncoding;
386
    /**
     * Constructs a Reader for a File.
     * <p>
     * The file is converted with {@link File#toPath()} and handled by {@link #XmlStreamReader(Path)}.
     * </p>
     * <p>
     * It looks for the UTF-8 BOM first; if there is none, it sniffs the XML prolog charset; if this is also missing, it defaults to UTF-8.
     * </p>
     * <p>
     * It does a lenient charset encoding detection; check the constructor with the lenient parameter for details.
     * </p>
     *
     * @param file File to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the file.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final File file) throws IOException {
        this(Objects.requireNonNull(file, "file").toPath());
    }
405
    /**
     * Constructs a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * </p>
     * <p>
     * It does a lenient charset encoding detection (this constructor passes {@code lenient = true}); check the constructor with the lenient parameter for
     * details.
     * </p>
     *
     * @param inputStream InputStream to create a Reader from.
     * @throws NullPointerException if the input stream is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream) throws IOException {
        this(inputStream, true);
    }
424
    /**
     * Constructs a Reader for a raw InputStream, without a default encoding.
     * <p>
     * It follows the same logic used for files.
     * </p>
     * <p>
     * If lenient detection is indicated and the detection above fails as per the specifications, it then attempts the following:
     * </p>
     * <ul>
     * <li>If the content type was 'text/html', it replaces it with 'text/xml' and tries the detection again.</li>
     * <li>Else if the XML prolog had a charset encoding, that encoding is used.</li>
     * <li>Else if the content type had a charset encoding, that encoding is used.</li>
     * <li>Else 'UTF-8' is used.</li>
     * </ul>
     * <p>
     * If lenient detection is indicated, an XmlStreamReaderException is never thrown.
     * </p>
     * <p>
     * NOTE(review): this constructor supplies no content type, so the content-type steps above presumably never apply here; confirm against the detection
     * code.
     * </p>
     *
     * @param inputStream InputStream to create a Reader from.
     * @param lenient     indicates if the charset encoding detection should be relaxed.
     * @throws NullPointerException     if the input stream is {@code null}.
     * @throws IOException              thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
        this(inputStream, lenient, null);
    }
460
461    /**
462     * Constructs a Reader for a raw InputStream.
463     * <p>
464     * It follows the same logic used for files.
465     * </p>
466     * <p>
467     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
468     * </p>
469     * <p>
470     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
471     * </p>
472     * <p>
473     * Else if the XML prolog had a charset encoding that encoding is used.
474     * </p>
475     * <p>
476     * Else if the content type had a charset encoding that encoding is used.
477     * </p>
478     * <p>
479     * Else 'UTF-8' is used.
480     * </p>
481     * <p>
482     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
483     * </p>
484     *
485     * @param inputStream     InputStream to create a Reader from.
486     * @param lenient         indicates if the charset encoding detection should be relaxed.
487     * @param defaultEncoding The default encoding.
488     * @throws NullPointerException     if the input stream is {@code null}.
489     * @throws IOException              thrown if there is a problem reading the stream.
490     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
491     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
492     */
493    @Deprecated
494    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
495    public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
496        this.defaultEncoding = defaultEncoding;
497        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
498                false, BOMS);
499        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
500        this.encoding = toEncoding(bom, pis, lenient);
501        this.reader = new InputStreamReader(pis, encoding);
502    }
503
    /**
     * Constructs a Reader using an InputStream and the associated content-type header.
     * <p>
     * First it checks if the stream has a BOM. If there is no BOM, it checks the content-type encoding. If there is no content-type encoding, it checks the
     * XML prolog encoding. If there is no XML prolog encoding, it uses the default encoding mandated by the content-type MIME type.
     * </p>
     * <p>
     * It does a lenient charset encoding detection (this constructor passes {@code lenient = true}); check the constructor with the lenient parameter for
     * details.
     * </p>
     *
     * @param inputStream     InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @throws NullPointerException if the input stream is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
        this(inputStream, httpContentType, true);
    }
524
    /**
     * Constructs a Reader using an InputStream and the associated content-type header, without a default encoding.
     * <p>
     * First it checks if the stream has a BOM. If there is no BOM, it checks the content-type encoding. If there is no content-type encoding, it checks the
     * XML prolog encoding. If there is no XML prolog encoding, it uses the default encoding mandated by the content-type MIME type.
     * </p>
     * <p>
     * If lenient detection is indicated and the detection above fails as per the specifications, it then attempts the following:
     * </p>
     * <ul>
     * <li>If the content type was 'text/html', it replaces it with 'text/xml' and tries the detection again.</li>
     * <li>Else if the XML prolog had a charset encoding, that encoding is used.</li>
     * <li>Else if the content type had a charset encoding, that encoding is used.</li>
     * <li>Else 'UTF-8' is used.</li>
     * </ul>
     * <p>
     * If lenient detection is indicated, an XmlStreamReaderException is never thrown.
     * </p>
     *
     * @param inputStream     InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @param lenient         indicates if the charset encoding detection should be relaxed.
     * @throws NullPointerException     if the input stream is {@code null}.
     * @throws IOException              thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
        this(inputStream, httpContentType, lenient, null);
    }
562
563    /**
564     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
565     * <p>
566     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
567     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
568     * </p>
569     * <p>
570     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
571     * </p>
572     * <p>
573     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
574     * </p>
575     * <p>
576     * Else if the XML prolog had a charset encoding that encoding is used.
577     * </p>
578     * <p>
579     * Else if the content type had a charset encoding that encoding is used.
580     * </p>
581     * <p>
582     * Else 'UTF-8' is used.
583     * </p>
584     * <p>
585     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
586     * </p>
587     *
588     * @param inputStream     InputStream to create the reader from.
589     * @param httpContentType content-type header to use for the resolution of the charset encoding.
590     * @param lenient         indicates if the charset encoding detection should be relaxed.
591     * @param defaultEncoding The default encoding.
592     * @throws NullPointerException     if the input stream is {@code null}.
593     * @throws IOException              thrown if there is a problem reading the file.
594     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
595     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
596     */
597    @Deprecated
598    @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
599    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
600            throws IOException {
601        this.defaultEncoding = defaultEncoding;
602        final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
603                false, BOMS);
604        final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
605        this.encoding = toEncoding(bom, pis, lenient, httpContentType);
606        this.reader = new InputStreamReader(pis, encoding);
607    }
608
    /**
     * Constructs a Reader for a file denoted by a {@link Path}.
     * <p>
     * The file is opened with {@link Files#newInputStream(Path, java.nio.file.OpenOption...)} and handled by {@link #XmlStreamReader(InputStream)}.
     * </p>
     * <p>
     * It looks for the UTF-8 BOM first; if there is none, it sniffs the XML prolog charset; if this is also missing, it defaults to UTF-8.
     * </p>
     * <p>
     * It does a lenient charset encoding detection; check the constructor with the lenient parameter for details.
     * </p>
     *
     * @param file File to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the file.
     * @since 2.11.0
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    @SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
    public XmlStreamReader(final Path file) throws IOException {
        this(Files.newInputStream(Objects.requireNonNull(file, "file")));
    }
629
    /**
     * Constructs a Reader using the InputStream of a URL.
     * <p>
     * A connection is opened with {@link URL#openConnection()} and handled by {@link #XmlStreamReader(URLConnection, String)} with a {@code null} default
     * encoding.
     * </p>
     * <p>
     * If the URL is not of type HTTP and there is no 'content-type' header in the fetched data, it uses the same logic used for Files.
     * </p>
     * <p>
     * If the URL is an HTTP URL or there is a 'content-type' header in the fetched data, it uses the same logic used for an InputStream with content-type.
     * </p>
     * <p>
     * It does a lenient charset encoding detection; check the constructor with the lenient parameter for details.
     * </p>
     *
     * @param url URL to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream of the URL.
     */
    public XmlStreamReader(final URL url) throws IOException {
        this(Objects.requireNonNull(url, "url").openConnection(), null);
    }
649
650    /**
651     * Constructs a Reader using the InputStream of a URLConnection.
652     * <p>
653     * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files.
654     * </p>
655     * <p>
656     * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
657     * content-type.
658     * </p>
659     * <p>
660     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
661     * </p>
662     *
663     * @param urlConnection   URLConnection to create a Reader from.
664     * @param defaultEncoding The default encoding.
665     * @throws NullPointerException if the input is {@code null}.
666     * @throws IOException          thrown if there is a problem reading the stream of the URLConnection.
667     */
668    public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
669        Objects.requireNonNull(urlConnection, "urlConnection");
670        this.defaultEncoding = defaultEncoding;
671        final boolean lenient = true;
672        final String contentType = urlConnection.getContentType();
673        final InputStream inputStream = urlConnection.getInputStream();
674        @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
675        // @formatter:off
676        final BOMInputStream bomInput = BOMInputStream.builder()
677            .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
678            .setInclude(false)
679            .setByteOrderMarks(BOMS)
680            .get();
681        @SuppressWarnings("resource")
682        final BOMInputStream piInput = BOMInputStream.builder()
683            .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
684            .setInclude(true)
685            .setByteOrderMarks(XML_GUESS_BYTES)
686            .get();
687        // @formatter:on
688        if (urlConnection instanceof HttpURLConnection || contentType != null) {
689            this.encoding = toEncoding(bomInput, piInput, lenient, contentType);
690        } else {
691            this.encoding = toEncoding(bomInput, piInput, lenient);
692        }
693        this.reader = new InputStreamReader(piInput, encoding);
694    }
695
696    /**
697     * Closes the XmlStreamReader stream.
698     *
699     * @throws IOException thrown if there was a problem closing the stream.
700     */
701    @Override
702    public void close() throws IOException {
703        reader.close();
704    }
705
706    /**
707     * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
708     * <p>
709     * If it is {@code null} the content-type based rules are used.
710     * </p>
711     *
712     * @return the default encoding to use.
713     */
714    public String getDefaultEncoding() {
715        return defaultEncoding;
716    }
717
718    /**
719     * Gets the charset encoding of the XmlStreamReader.
720     *
721     * @return charset encoding.
722     */
723    public String getEncoding() {
724        return encoding;
725    }
726
727    /**
728     * Reads the underlying reader's {@code read(char[], int, int)} method.
729     *
730     * @param buf    the buffer to read the characters into.
731     * @param offset The start offset.
732     * @param len    The number of bytes to read.
733     * @return the number of characters read or -1 if the end of stream.
734     * @throws IOException if an I/O error occurs.
735     */
736    @Override
737    public int read(final char[] buf, final int offset, final int len) throws IOException {
738        return reader.read(buf, offset, len);
739    }
740
741    /**
742     * Process the raw stream.
743     *
744     * @param bomInput     BOMInputStream to detect byte order marks.
745     * @param piInput     BOMInputStream to guess XML encoding.
746     * @param lenient indicates if the charset encoding detection should be relaxed.
747     * @return the encoding to be used.
748     * @throws IOException thrown if there is a problem reading the stream.
749     */
750    private String toEncoding(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
751        final String bomEnc = bomInput.getBOMCharsetName();
752        final String xmlGuessEnc = piInput.getBOMCharsetName();
753        final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
754        try {
755            return toRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
756        } catch (final XmlStreamReaderException ex) {
757            if (lenient) {
758                return toEncodingLenient(null, ex);
759            }
760            throw ex;
761        }
762    }
763
764    /**
765     * Processes an HTTP stream.
766     *
767     * @param bomInput        BOMInputStream to detect byte order marks.
768     * @param piInput         BOMInputStream to guess XML encoding.
769     * @param lenient         indicates if the charset encoding detection should be relaxed.
770     * @param httpContentType The HTTP content type.
771     * @return the encoding to be used.
772     * @throws IOException thrown if there is a problem reading the stream.
773     */
774    private String toEncoding(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
775            throws IOException {
776        final String bomEnc = bomInput.getBOMCharsetName();
777        final String xmlGuessEnc = piInput.getBOMCharsetName();
778        final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
779        try {
780            return toHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
781        } catch (final XmlStreamReaderException ex) {
782            if (lenient) {
783                return toEncodingLenient(httpContentType, ex);
784            }
785            throw ex;
786        }
787    }
788
789    /**
790     * Detects the encoding in lenient mode.
791     *
792     * @param httpContentType content-type header to use for the resolution of the charset encoding.
793     * @param ex              The thrown exception.
794     * @return the encoding.
795     * @throws IOException thrown if there is a problem reading the stream.
796     */
797    private String toEncodingLenient(String httpContentType, XmlStreamReaderException ex) throws IOException {
798        if (httpContentType != null && httpContentType.startsWith("text/html")) {
799            httpContentType = httpContentType.substring("text/html".length());
800            httpContentType = "text/xml" + httpContentType;
801            try {
802                return toHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
803            } catch (final XmlStreamReaderException ex2) {
804                ex = ex2;
805            }
806        }
807        String encoding = ex.getXmlEncoding();
808        if (encoding == null) {
809            encoding = ex.getContentTypeEncoding();
810        }
811        if (encoding == null) {
812            encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
813        }
814        return encoding;
815    }
816
817    /**
818     * Calculates the HTTP encoding.
819     *
820     * @param bomEnc          BOM encoding.
821     * @param xmlGuessEnc     XML Guess encoding.
822     * @param xmlEnc          XML encoding.
823     * @param lenient         indicates if the charset encoding detection should be relaxed.
824     * @param httpContentType The HTTP content type.
825     * @return the HTTP encoding.
826     * @throws IOException thrown if there is a problem reading the stream.
827     */
828    String toHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
829            throws IOException {
830        // Lenient and has XML encoding
831        if (lenient && xmlEnc != null) {
832            return xmlEnc;
833        }
834        // Determine mime/encoding content types from HTTP Content Type
835        final String cTMime = getContentTypeMime(httpContentType);
836        final String cTEnc = getContentTypeEncoding(httpContentType);
837        final boolean appXml = isAppXml(cTMime);
838        final boolean textXml = isTextXml(cTMime);
839        // Mime type NOT "application/xml" or "text/xml"
840        if (!appXml && !textXml) {
841            final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
842            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
843        }
844        // No content type encoding
845        if (cTEnc == null) {
846            if (appXml) {
847                return toRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
848            }
849            return defaultEncoding == null ? US_ASCII : defaultEncoding;
850        }
851        // UTF-16BE or UTF-16LE content type encoding
852        if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
853            if (bomEnc != null) {
854                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
855                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
856            }
857            return cTEnc;
858        }
859        // UTF-16 content type encoding
860        if (cTEnc.equals(UTF_16)) {
861            if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
862                return bomEnc;
863            }
864            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
865            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
866        }
867        // UTF-32BE or UTF-132E content type encoding
868        if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
869            if (bomEnc != null) {
870                final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
871                throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
872            }
873            return cTEnc;
874        }
875        // UTF-32 content type encoding
876        if (cTEnc.equals(UTF_32)) {
877            if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
878                return bomEnc;
879            }
880            final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
881            throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
882        }
883        return cTEnc;
884    }
885
886    /**
887     * Calculate the raw encoding.
888     *
889     * @param bomEnc      BOM encoding.
890     * @param xmlGuessEnc XML Guess encoding.
891     * @param xmlEnc      XML encoding.
892     * @return the raw encoding.
893     * @throws IOException thrown if there is a problem reading the stream.
894     */
895    String toRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
896
897        // BOM is Null
898        if (bomEnc == null) {
899            if (xmlGuessEnc == null || xmlEnc == null) {
900                return defaultEncoding == null ? UTF_8 : defaultEncoding;
901            }
902            if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
903                return xmlGuessEnc;
904            }
905            return xmlEnc;
906        }
907
908        // BOM is UTF-8
909        if (bomEnc.equals(UTF_8)) {
910            if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8) || xmlEnc != null && !xmlEnc.equals(UTF_8)) {
911                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
912                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
913            }
914            return bomEnc;
915        }
916
917        // BOM is UTF-16BE or UTF-16LE
918        if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
919            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
920                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
921                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
922            }
923            if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
924                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
925                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
926            }
927            return bomEnc;
928        }
929
930        // BOM is UTF-32BE or UTF-32LE
931        if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
932            if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
933                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
934                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
935            }
936            if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
937                final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
938                throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
939            }
940            return bomEnc;
941        }
942
943        // BOM is something else
944        final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
945        throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
946    }
947
948}