001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.io.input;
019
020import static org.apache.commons.io.IOUtils.EOF;
021
022import java.io.IOException;
023import java.io.InputStream;
024import java.util.Arrays;
025import java.util.Comparator;
026import java.util.List;
027import java.util.Objects;
028
029import org.apache.commons.io.ByteOrderMark;
030import org.apache.commons.io.IOUtils;
031
032/**
033 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
034 * <p>
035 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the first byte in the stream.
036 * </p>
037 * <p>
038 * The {@link ByteOrderMark} implementation has the following predefined BOMs:
039 * </p>
040 * <ul>
041 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
042 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
043 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
044 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
045 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
046 * </ul>
047 * <p>
048 * To build an instance, use {@link Builder}.
049 * </p>
050 * <h2>Example 1 - Detecting and excluding a UTF-8 BOM</h2>
051 *
052 * <pre>
053 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).get();
054 * if (bomIn.hasBOM()) {
055 *     // has a UTF-8 BOM
056 * }
057 * </pre>
058 *
059 * <h2>Example 2 - Detecting a UTF-8 BOM without excluding it</h2>
060 *
061 * <pre>
062 * boolean include = true;
063 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in).setInclude(include).get();
064 * if (bomIn.hasBOM()) {
065 *     // has a UTF-8 BOM
066 * }
067 * </pre>
068 *
069 * <h2>Example 3 - Detecting Multiple BOMs</h2>
070 *
071 * <pre>
072 * BOMInputStream bomIn = BOMInputStream.builder().setInputStream(in)
073 *         .setByteOrderMarks(ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE).get();
074 * if (bomIn.hasBOM() == false) {
075 *     // No BOM found
076 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
077 *     // has a UTF-16LE BOM
078 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
079 *     // has a UTF-16BE BOM
080 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
081 *     // has a UTF-32LE BOM
082 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
083 *     // has a UTF-32BE BOM
084 * }
085 * </pre>
086 * <p>
087 * To build an instance, use {@link Builder}.
088 * </p>
089 * <p>
090 * This class is not thread-safe.
091 * </p>
092 *
093 * @see Builder
094 * @see org.apache.commons.io.ByteOrderMark
095 * @see <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
096 * @since 2.0
097 */
098public class BOMInputStream extends ProxyInputStream {
099
100    // @formatter:off
101    /**
102     * Builds a new {@link BOMInputStream}.
103     *
104     * <h2>Using NIO</h2>
105     * <pre>{@code
106     * BOMInputStream s = BOMInputStream.builder()
107     *   .setPath(Paths.get("MyFile.xml"))
108     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
109     *   .setInclude(false)
110     *   .get();}
111     * </pre>
112     * <h2>Using IO</h2>
113     * <pre>{@code
114     * BOMInputStream s = BOMInputStream.builder()
115     *   .setFile(new File("MyFile.xml"))
116     *   .setByteOrderMarks(ByteOrderMark.UTF_8)
117     *   .setInclude(false)
118     *   .get();}
119     * </pre>
120     *
121     * @see #get()
122     * @since 2.12.0
123     */
124    // @formatter:on
125    public static class Builder extends AbstractBuilder<BOMInputStream, Builder> {
126
127        private static final ByteOrderMark[] DEFAULT = { ByteOrderMark.UTF_8 };
128
129        /**
130         * For test access.
131         *
132         * @return the default byte order mark.
133         */
134        static ByteOrderMark getDefaultByteOrderMark() {
135            return DEFAULT[0];
136        }
137
138        private ByteOrderMark[] byteOrderMarks = DEFAULT;
139        private boolean include;
140
141        /**
142         * Constructs a new builder of {@link BOMInputStream}.
143         */
144        public Builder() {
145            // empty
146        }
147
148        /**
149         * Builds a new {@link BOMInputStream}.
150         * <p>
151         * You must set an aspect that supports {@link #getInputStream()}, otherwise, this method throws an exception.
152         * </p>
153         * <p>
154         * This builder uses the following aspects: InputStream, OpenOption[], include, and ByteOrderMark[].
155         * </p>
156         * <p>
157         * This builder uses the following aspects:
158         * </p>
159         * <ul>
160         * <li>{@link #getInputStream()}</li>
161         * <li>include}</li>
162         * <li>byteOrderMarks</li>
163         * </ul>
164         *
165         * @return a new instance.
166         * @throws IllegalStateException         if the {@code origin} is {@code null}.
167         * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
168         * @throws IOException                   if an I/O error occurs converting to an {@link InputStream} using {@link #getInputStream()}.
169         * @see #getInputStream()
170         * @see #getUnchecked()
171         */
172        @Override
173        public BOMInputStream get() throws IOException {
174            return new BOMInputStream(this);
175        }
176
177        /**
178         * Sets the ByteOrderMarks to detect and optionally exclude.
179         * <p>
180         * The default is {@link ByteOrderMark#UTF_8}.
181         * </p>
182         *
183         * @param byteOrderMarks the ByteOrderMarks to detect and optionally exclude.
184         * @return {@code this} instance.
185         */
186        public Builder setByteOrderMarks(final ByteOrderMark... byteOrderMarks) {
187            this.byteOrderMarks = byteOrderMarks != null ? byteOrderMarks.clone() : DEFAULT;
188            return this;
189        }
190
191        /**
192         * Sets whether to include the UTF-8 BOM (true) or to exclude it (false).
193         * <p>
194         * The default is false.
195         * </p>
196         *
197         * @param include true to include the UTF-8 BOM or false to exclude it. return this;.
198         * @return {@code this} instance.
199         */
200        public Builder setInclude(final boolean include) {
201            this.include = include;
202            return this;
203        }
204    }
205
206    /**
207     * Compares ByteOrderMark objects in descending length order.
208     */
209    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = Comparator.comparing(ByteOrderMark::length).reversed();
210
    /**
     * Constructs a new {@link Builder} with its default settings.
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }
220
    /**
     * BOMs are sorted from longest to shortest.
     */
    private final List<ByteOrderMark> bomList;
    /** The BOM matched by {@code readBom()} during construction, or {@code null} if none matched. */
    private final ByteOrderMark byteOrderMark;
    /** Index into {@code firstBytes} of the next value {@code readFirstBytes()} hands out. */
    private int fbIndex;
    /**
     * The values read ahead from the delegate while looking for a BOM. The last entry may be a negative end-of-stream value when the stream is shorter
     * than the longest configured BOM.
     */
    private int[] firstBytes;
    /** Whether a detected BOM is returned to readers ({@code true}) or skipped ({@code false}). */
    private final boolean include;
    /** Set by {@code mark(int)} to whether {@code firstBytes} was {@code null} at that time; consulted by {@code reset()}. */
    private boolean markedAtStart;
    /** The value of {@code fbIndex} captured by the last {@code mark(int)} call. */
    private int markFbIndex;
231
232    /**
233     * Constructs a new instance.
234     *
235     * @param builder The builder.
236     * @throws IOException if an error reading the first bytes of the stream occurs.
237     */
238    private BOMInputStream(final Builder builder) throws IOException {
239        super(builder);
240        if (IOUtils.length(builder.byteOrderMarks) == 0) {
241            throw new IllegalArgumentException("No ByteOrderMark specified.");
242        }
243        this.include = builder.include;
244        final List<ByteOrderMark> bomList = Arrays.asList(builder.byteOrderMarks);
245        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
246        bomList.sort(ByteOrderMarkLengthComparator);
247        this.bomList = bomList;
248        this.byteOrderMark = readBom();
249    }
250
    /**
     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
     * <p>
     * Delegates to the three-argument constructor with {@code include} set to {@code false} and the builder's default byte order marks.
     * </p>
     *
     * @param delegate the InputStream to delegate to.
     * @throws IOException if an error reading the first bytes of the stream occurs.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate) throws IOException {
        this(delegate, false, Builder.DEFAULT);
    }
262
    /**
     * Constructs a new BOM InputStream that detects a {@link ByteOrderMark#UTF_8} and optionally includes it.
     * <p>
     * Delegates to the three-argument constructor with the builder's default byte order marks.
     * </p>
     *
     * @param delegate the InputStream to delegate to.
     * @param include  true to include the UTF-8 BOM or false to exclude it.
     * @throws IOException if an error reading the first bytes of the stream occurs.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final boolean include) throws IOException {
        this(delegate, include, Builder.DEFAULT);
    }
275
276    /**
277     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
278     *
279     * @param delegate the InputStream to delegate to.
280     * @param include  true to include the specified BOMs or false to exclude them.
281     * @param boms     The BOMs to detect and optionally exclude.
282     * @throws IOException if an error reading the first bytes of the stream occurs.
283     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
284     */
285    @Deprecated
286    public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) throws IOException {
287        super(delegate);
288        if (IOUtils.length(boms) == 0) {
289            throw new IllegalArgumentException("No BOMs specified");
290        }
291        this.include = include;
292        final List<ByteOrderMark> list = Arrays.asList(boms);
293        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
294        list.sort(ByteOrderMarkLengthComparator);
295        this.bomList = list;
296        this.byteOrderMark = readBom();
297    }
298
    /**
     * Constructs a new BOM InputStream that excludes the specified BOMs.
     * <p>
     * Delegates to the three-argument constructor with {@code include} set to {@code false}.
     * </p>
     *
     * @param delegate the InputStream to delegate to.
     * @param boms     The BOMs to detect and exclude.
     * @throws IOException if an error reading the first bytes of the stream occurs.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}.
     */
    @Deprecated
    public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) throws IOException {
        this(delegate, false, boms);
    }
311
312    /**
313     * Finds a ByteOrderMark with the configured bytes in {@code bomList}.
314     *
315     * @return The matched BOM or null if none matched.
316     */
317    private ByteOrderMark find() {
318        return bomList.stream().filter(this::matches).findFirst().orElse(null);
319    }
320
    /**
     * Gets the ByteOrderMark (Byte Order Mark) detected when this stream was constructed.
     *
     * @return The BOM or null if none matched.
     */
    public ByteOrderMark getBOM() {
        return byteOrderMark;
    }
329
330    /**
331     * Gets the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
332     *
333     * @return The BOM charset Name or null if no BOM found.
334     * @throws IOException if an error reading the first bytes of the stream occurs.
335     */
336    public String getBOMCharsetName() throws IOException {
337        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
338    }
339
    /**
     * Tests whether the stream contains one of the specified BOMs, that is, whether {@link #getBOM()} is non-null.
     *
     * @return true if the stream has one of the specified BOMs, otherwise false if it does not.
     * @throws IOException if an error reading the first bytes of the stream occurs.
     */
    public boolean hasBOM() throws IOException {
        return getBOM() != null;
    }
349
350    /**
351     * Tests whether the stream contains the specified BOM.
352     *
353     * @param bom The BOM to check for.
354     * @return true if the stream has the specified BOM, otherwise false if it does not.
355     * @throws IllegalArgumentException if the BOM is not one the stream is configured to detect.
356     * @throws IOException              if an error reading the first bytes of the stream occurs.
357     */
358    public boolean hasBOM(final ByteOrderMark bom) throws IOException {
359        if (!bomList.contains(bom)) {
360            throw new IllegalArgumentException("Stream not configured to detect " + bom);
361        }
362        return Objects.equals(getBOM(), bom);
363    }
364
    /**
     * Records the current position within the buffered first bytes, then invokes the delegate's {@link InputStream#mark(int)} method.
     *
     * @param readLimit read ahead limit, passed unchanged to the delegate.
     */
    @Override
    public synchronized void mark(final int readLimit) {
        // Remember how far into firstBytes we are so reset() can rewind to the same spot.
        markFbIndex = fbIndex;
        // NOTE(review): firstBytes is assigned by readBom() in every constructor, so this looks like it is always false - confirm.
        markedAtStart = firstBytes == null;
        in.mark(readLimit);
    }
376
    /**
     * Checks if the first bytes read from the stream ({@code firstBytes}) match a BOM.
     *
     * @param bom The BOM.
     * @return true if the bytes match the BOM, otherwise false.
     */
    private boolean matches(final ByteOrderMark bom) {
        return bom.matches(firstBytes);
    }
386
387    /**
388     * Invokes the delegate's {@code read()} method, detecting and optionally skipping BOM.
389     *
390     * @return the byte read (excluding BOM) or -1 if the end of stream.
391     * @throws IOException if an I/O error occurs.
392     */
393    @Override
394    public int read() throws IOException {
395        checkOpen();
396        final int b = readFirstBytes();
397        return b >= 0 ? b : in.read();
398    }
399
    /**
     * Reads bytes into the whole of the given buffer by calling {@link #read(byte[], int, int)} with offset 0 and the buffer's length.
     *
     * @param buf the buffer to read the bytes into, never {@code null}
     * @return the number of bytes read (excluding BOM) or -1 if the end of stream.
     * @throws NullPointerException if the buffer is {@code null}
     * @throws IOException          if an I/O error occurs.
     */
    @Override
    public int read(final byte[] buf) throws IOException {
        return read(buf, 0, buf.length);
    }
412
413    /**
414     * Invokes the delegate's {@link InputStream#read(byte[], int, int)} method, detecting and optionally skipping BOM.
415     *
416     * @param buf the buffer to read the bytes into.
417     * @param off The start offset.
418     * @param len The number of bytes to read (excluding BOM).
419     * @return the number of bytes read or -1 if the end of stream.
420     * @throws NullPointerException      if the buffer is {@code null}.
421     * @throws IndexOutOfBoundsException if {@code off} or {@code len} are negative, or if {@code off + len} is greater than {@code buf.length}.
422     * @throws IOException               if an I/O error occurs.
423     */
424    @Override
425    public int read(final byte[] buf, int off, int len) throws IOException {
426        IOUtils.checkFromIndexSize(buf, off, len);
427        if (len == 0) {
428            return 0;
429        }
430        int firstCount = 0;
431        int b = 0;
432        while (len > 0 && b >= 0) {
433            b = readFirstBytes();
434            if (b >= 0) {
435                buf[off++] = (byte) (b & 0xFF);
436                len--;
437                firstCount++;
438            }
439        }
440        final int secondCount = in.read(buf, off, len);
441        afterRead(secondCount);
442        return secondCount < 0 ? firstCount > 0 ? firstCount : EOF : firstCount + secondCount;
443    }
444
445    /**
446     * Reads the byte order mark.
447     *
448     * @return the byte order mark.
449     * @throws IOException if an error reading the first bytes of the stream occurs.
450     */
451    private ByteOrderMark readBom() throws IOException {
452        int fbLength = 0;
453        // BOMs are sorted from longest to shortest
454        final int maxBomSize = bomList.get(0).length();
455        final int[] tmp = new int[maxBomSize];
456        // Read first maxBomSize bytes
457        for (int i = 0; i < tmp.length; i++) {
458            tmp[i] = in.read();
459            afterRead(tmp[i]);
460            fbLength++;
461            if (tmp[i] < 0) {
462                break;
463            }
464        }
465        firstBytes = Arrays.copyOf(tmp, fbLength);
466        // match BOM in firstBytes
467        final ByteOrderMark bom = find();
468        if (bom != null && !include) {
469            if (bom.length() < firstBytes.length) {
470                fbIndex = bom.length();
471            } else {
472                firstBytes = new int[0];
473            }
474        }
475        return bom;
476    }
477
478    /**
479     * Reads and either preserves or skips the first bytes in the stream. This method behaves like the single-byte {@code read()} method, either returning a
480     * valid byte or -1 to indicate that the initial bytes have been processed already.
481     *
482     * @return the byte read (excluding BOM) or -1 if at the end of first bytes.
483     * @throws IOException if an I/O error occurs.
484     */
485    private int readFirstBytes() throws IOException {
486        return fbIndex < firstBytes.length ? firstBytes[fbIndex++] : EOF;
487    }
488
    /**
     * Restores the position within the buffered first bytes recorded by {@link #mark(int)}, then invokes the delegate's {@link InputStream#reset()}
     * method.
     *
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public synchronized void reset() throws IOException {
        // Rewind to the index captured by mark().
        fbIndex = markFbIndex;
        // NOTE(review): markedAtStart looks like it can never be true because firstBytes is assigned during construction;
        // if this branch ever ran, readFirstBytes() would dereference a null array - confirm.
        if (markedAtStart) {
            firstBytes = null;
        }
        in.reset();
    }
502
503    /**
504     * Invokes the delegate's {@link InputStream#skip(long)} method, detecting and optionally skipping BOM.
505     *
506     * @param n the number of bytes to skip.
507     * @return the number of bytes to skipped or -1 if the end of stream.
508     * @throws IOException if an I/O error occurs.
509     */
510    @Override
511    public long skip(final long n) throws IOException {
512        int skipped = 0;
513        while (n > skipped && readFirstBytes() >= 0) {
514            skipped++;
515        }
516        return in.skip(n - skipped) + skipped;
517    }
518}