001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import static org.apache.commons.io.IOUtils.EOF; 020 021import java.io.BufferedReader; 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.OutputStreamWriter; 026import java.io.Reader; 027import java.nio.ByteBuffer; 028import java.nio.CharBuffer; 029import java.nio.charset.Charset; 030import java.nio.charset.CharsetEncoder; 031import java.nio.charset.CoderResult; 032import java.nio.charset.CodingErrorAction; 033 034import org.apache.commons.io.Charsets; 035import org.apache.commons.io.IOUtils; 036import org.apache.commons.io.build.AbstractStreamBuilder; 037import org.apache.commons.io.charset.CharsetEncoders; 038 039/** 040 * {@link InputStream} implementation that reads a character stream from a {@link Reader} and transforms it to a byte stream using a specified charset encoding. 041 * The stream is transformed using a {@link CharsetEncoder} object, guaranteeing that all charset encodings supported by the JRE are handled correctly. In 042 * particular for charsets such as UTF-16, the implementation ensures that one and only one byte order marker is produced. 043 * <p> 044 * Since in general it is not possible to predict the number of characters to be read from the {@link Reader} to satisfy a read request on the 045 * {@link ReaderInputStream}, all reads from the {@link Reader} are buffered. There is therefore no well defined correlation between the current position of the 046 * {@link Reader} and that of the {@link ReaderInputStream}. This also implies that in general there is no need to wrap the underlying {@link Reader} in a 047 * {@link BufferedReader}. 048 * </p> 049 * <p> 050 * {@link ReaderInputStream} implements the inverse transformation of {@link InputStreamReader}; in the following example, reading from {@code in2} 051 * would return the same byte sequence as reading from {@code in} (provided that the initial byte sequence is legal with respect to the charset encoding): 052 * </p> 053 * <p> 054 * To build an instance, use {@link Builder}. 055 * </p> 056 * <pre> 057 * InputStream inputStream = ... 058 * Charset cs = ... 059 * InputStreamReader reader = new InputStreamReader(inputStream, cs); 060 * ReaderInputStream in2 = ReaderInputStream.builder() 061 * .setReader(reader) 062 * .setCharset(cs) 063 * .get(); 064 * </pre> 065 * <p> 066 * {@link ReaderInputStream} implements the same transformation as {@link OutputStreamWriter}, except that the control flow is reversed: both classes 067 * transform a character stream into a byte stream, but {@link OutputStreamWriter} pushes data to the underlying stream, while {@link ReaderInputStream} 068 * pulls it from the underlying stream. 069 * </p> 070 * <p> 071 * Note that while there are use cases where there is no alternative to using this class, very often the need to use this class is an indication of a flaw in 072 * the design of the code. This class is typically used in situations where an existing API only accepts an {@link InputStream}, but where the most natural way 073 * to produce the data is as a character stream, by providing a {@link Reader} instance. An example of a situation where this problem may appear is when 074 * implementing the {@code javax.activation.DataSource} interface from the Java Activation Framework. 075 * </p> 076 * <p> 077 * The {@link #available()} method of this class always returns 0. The methods {@link #mark(int)} and {@link #reset()} are not supported. 078 * </p> 079 * <p> 080 * Instances of {@link ReaderInputStream} are not thread safe. 081 * </p> 082 * 083 * @see Builder 084 * @see org.apache.commons.io.output.WriterOutputStream 085 * @since 2.0 086 */ 087public class ReaderInputStream extends AbstractInputStream { 088 089 // @formatter:off 090 /** 091 * Builds a new {@link ReaderInputStream}. 092 * 093 * <p> 094 * For example: 095 * </p> 096 * <pre>{@code 097 * ReaderInputStream s = ReaderInputStream.builder() 098 * .setPath(path) 099 * .setCharsetEncoder(Charset.defaultCharset().newEncoder()) 100 * .get();} 101 * </pre> 102 * 103 * @see #get() 104 * @since 2.12.0 105 */ 106 // @formatter:on 107 public static class Builder extends AbstractStreamBuilder<ReaderInputStream, Builder> { 108 109 private CharsetEncoder charsetEncoder = newEncoder(getCharset()); 110 111 /** 112 * Constructs a new builder of {@link ReaderInputStream}. 113 */ 114 public Builder() { 115 // empty 116 } 117 118 /** 119 * Builds a new {@link ReaderInputStream}. 120 * 121 * <p> 122 * You must set an aspect that supports {@link #getReader()}, otherwise, this method throws an exception. 123 * </p> 124 * <p> 125 * This builder uses the following aspects: 126 * </p> 127 * <ul> 128 * <li>{@link #getReader()} gets the target aspect.</li> 129 * <li>{@link #getBufferSize()}</li> 130 * <li>{@link #getCharset()}</li> 131 * <li>{@link CharsetEncoder}</li> 132 * </ul> 133 * 134 * @return a new instance. 135 * @throws UnsupportedOperationException if the origin cannot provide a {@link Reader}. 136 * @throws IllegalStateException if the {@code origin} is {@code null}. 137 * @throws IOException if an I/O error occurs converting to a {@link Reader} using {@link #getReader()}. 138 * @see #getReader() 139 * @see CharsetEncoder 140 * @see #getBufferSize() 141 * @see #getUnchecked() 142 */ 143 @Override 144 public ReaderInputStream get() throws IOException { 145 return new ReaderInputStream(this); 146 } 147 148 CharsetEncoder getCharsetEncoder() { 149 return charsetEncoder; 150 } 151 152 @Override 153 public Builder setCharset(final Charset charset) { 154 super.setCharset(charset); 155 charsetEncoder = newEncoder(getCharset()); 156 return this; 157 } 158 159 /** 160 * Sets the charset encoder. Assumes that the caller has configured the encoder. 161 * 162 * @param newEncoder the charset encoder, null resets to a default encoder. 163 * @return {@code this} instance. 164 */ 165 public Builder setCharsetEncoder(final CharsetEncoder newEncoder) { 166 charsetEncoder = CharsetEncoders.toCharsetEncoder(newEncoder, () -> newEncoder(getCharsetDefault())); 167 super.setCharset(charsetEncoder.charset()); 168 return this; 169 } 170 171 } 172 173 /** 174 * Constructs a new {@link Builder}. 175 * 176 * @return a new {@link Builder}. 177 * @since 2.12.0 178 */ 179 public static Builder builder() { 180 return new Builder(); 181 } 182 183 static int checkMinBufferSize(final CharsetEncoder charsetEncoder, final int bufferSize) { 184 final float minRequired = minBufferSize(charsetEncoder); 185 if (bufferSize < minRequired) { 186 throw new IllegalArgumentException(String.format("Buffer size %,d must be at least %s for a CharsetEncoder %s.", bufferSize, minRequired, 187 charsetEncoder.charset().displayName())); 188 } 189 return bufferSize; 190 } 191 192 static float minBufferSize(final CharsetEncoder charsetEncoder) { 193 return charsetEncoder.maxBytesPerChar() * 2; 194 } 195 196 private static CharsetEncoder newEncoder(final Charset charset) { 197 // @formatter:off 198 return Charsets.toCharset(charset).newEncoder() 199 .onMalformedInput(CodingErrorAction.REPLACE) 200 .onUnmappableCharacter(CodingErrorAction.REPLACE); 201 // @formatter:on 202 } 203 204 private final Reader reader; 205 206 private final CharsetEncoder charsetEncoder; 207 208 /** 209 * CharBuffer used as input for the decoder. It should be reasonably large as we read data from the underlying Reader into this buffer. 210 */ 211 private final CharBuffer encoderIn; 212 213 /** 214 * ByteBuffer used as output for the decoder. This buffer can be small as it is only used to transfer data from the decoder to the buffer provided by the 215 * caller. 216 */ 217 private final ByteBuffer encoderOut; 218 219 private CoderResult lastCoderResult; 220 221 private boolean endOfInput; 222 223 @SuppressWarnings("resource") // caller closes. 224 private ReaderInputStream(final Builder builder) throws IOException { 225 this(builder.getReader(), builder.charsetEncoder, builder.getBufferSize()); 226 } 227 228 /** 229 * Constructs a new {@link ReaderInputStream} that uses the virtual machine's {@linkplain Charset#defaultCharset() default charset} with a default input 230 * buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 231 * 232 * @param reader the target {@link Reader} 233 * @deprecated Use {@link ReaderInputStream#builder()} instead. 234 */ 235 @Deprecated 236 public ReaderInputStream(final Reader reader) { 237 this(reader, Charset.defaultCharset()); 238 } 239 240 /** 241 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 242 * 243 * <p> 244 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 245 * </p> 246 * 247 * @param reader the target {@link Reader} 248 * @param charset the charset encoding. 249 * @deprecated Use {@link ReaderInputStream#builder()} instead, will be protected for subclasses. 250 */ 251 @Deprecated 252 public ReaderInputStream(final Reader reader, final Charset charset) { 253 this(reader, charset, IOUtils.DEFAULT_BUFFER_SIZE); 254 } 255 256 /** 257 * Constructs a new {@link ReaderInputStream}. 258 * 259 * <p> 260 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 261 * </p> 262 * 263 * @param reader the target {@link Reader}. 264 * @param charset the charset encoding. 265 * @param bufferSize the size of the input buffer in number of characters. 266 * @deprecated Use {@link ReaderInputStream#builder()} instead. 267 */ 268 @Deprecated 269 public ReaderInputStream(final Reader reader, final Charset charset, final int bufferSize) { 270 this(reader, newEncoder(charset), bufferSize); 271 } 272 273 /** 274 * Constructs a new {@link ReaderInputStream}. 275 * 276 * <p> 277 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 278 * an encoder which had already been in use. 279 * </p> 280 * 281 * @param reader the target {@link Reader} 282 * @param charsetEncoder the charset encoder. 283 * @since 2.1 284 * @deprecated Use {@link ReaderInputStream#builder()} instead. 285 */ 286 @Deprecated 287 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder) { 288 this(reader, charsetEncoder, IOUtils.DEFAULT_BUFFER_SIZE); 289 } 290 291 /** 292 * Constructs a new {@link ReaderInputStream}. 293 * 294 * <p> 295 * This constructor does not call {@link CharsetEncoder#reset() reset} on the provided encoder. The caller of this constructor should do this when providing 296 * an encoder which had already been in use. 297 * </p> 298 * 299 * @param reader the target {@link Reader} 300 * @param charsetEncoder the charset encoder, null defaults to the default Charset encoder. 301 * @param bufferSize the size of the input buffer in number of characters. 302 * @since 2.1 303 * @deprecated Use {@link ReaderInputStream#builder()} instead. 304 */ 305 @Deprecated 306 public ReaderInputStream(final Reader reader, final CharsetEncoder charsetEncoder, final int bufferSize) { 307 this.reader = reader; 308 this.charsetEncoder = CharsetEncoders.toCharsetEncoder(charsetEncoder); 309 this.encoderIn = CharBuffer.allocate(checkMinBufferSize(this.charsetEncoder, bufferSize)); 310 this.encoderIn.flip(); 311 this.encoderOut = ByteBuffer.allocate(128); 312 this.encoderOut.flip(); 313 } 314 315 /** 316 * Constructs a new {@link ReaderInputStream} with a default input buffer size of {@value IOUtils#DEFAULT_BUFFER_SIZE} characters. 317 * 318 * <p> 319 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 320 * </p> 321 * 322 * @param reader the target {@link Reader} 323 * @param charsetName the name of the charset encoding. 324 * @deprecated Use {@link ReaderInputStream#builder()} instead. 325 */ 326 @Deprecated 327 public ReaderInputStream(final Reader reader, final String charsetName) { 328 this(reader, charsetName, IOUtils.DEFAULT_BUFFER_SIZE); 329 } 330 331 /** 332 * Constructs a new {@link ReaderInputStream}. 333 * 334 * <p> 335 * The encoder created for the specified charset will use {@link CodingErrorAction#REPLACE} for malformed input and unmappable characters. 336 * </p> 337 * 338 * @param reader the target {@link Reader} 339 * @param charsetName the name of the charset encoding, null maps to the default Charset. 340 * @param bufferSize the size of the input buffer in number of characters. 341 * @deprecated Use {@link ReaderInputStream#builder()} instead. 342 */ 343 @Deprecated 344 public ReaderInputStream(final Reader reader, final String charsetName, final int bufferSize) { 345 this(reader, Charsets.toCharset(charsetName), bufferSize); 346 } 347 348 @Override 349 public int available() throws IOException { 350 if (encoderOut.hasRemaining()) { 351 return encoderOut.remaining(); 352 } 353 return 0; 354 } 355 356 /** 357 * Closes the stream. This method will cause the underlying {@link Reader} to be closed. 358 * 359 * @throws IOException if an I/O error occurs. 360 */ 361 @Override 362 public void close() throws IOException { 363 reader.close(); 364 super.close(); 365 } 366 367 /** 368 * Fills the internal char buffer from the reader. 369 * 370 * @throws IOException If an I/O error occurs. 371 */ 372 private void fillBuffer() throws IOException { 373 if (endOfInput) { 374 return; 375 } 376 if (!endOfInput && (lastCoderResult == null || lastCoderResult.isUnderflow())) { 377 encoderIn.compact(); 378 final int position = encoderIn.position(); 379 // We don't use Reader#read(CharBuffer) here because it is more efficient 380 // to write directly to the underlying char array (the default implementation 381 // copies data to a temporary char array). 382 final int c = reader.read(encoderIn.array(), position, encoderIn.remaining()); 383 if (c == EOF) { 384 endOfInput = true; 385 } else { 386 encoderIn.position(position + c); 387 } 388 encoderIn.flip(); 389 } 390 encoderOut.compact(); 391 lastCoderResult = charsetEncoder.encode(encoderIn, encoderOut, endOfInput); 392 if (endOfInput) { 393 lastCoderResult = charsetEncoder.flush(encoderOut); 394 } 395 if (lastCoderResult.isError()) { 396 lastCoderResult.throwException(); 397 } 398 encoderOut.flip(); 399 } 400 401 /** 402 * Gets the CharsetEncoder. 403 * 404 * @return the CharsetEncoder. 405 */ 406 CharsetEncoder getCharsetEncoder() { 407 return charsetEncoder; 408 } 409 410 /** 411 * Reads a single byte. 412 * 413 * @return either the byte read or {@code -1} if the end of the stream has been reached. 414 * @throws IOException if an I/O error occurs. 415 */ 416 @Override 417 public int read() throws IOException { 418 checkOpen(); 419 for (;;) { 420 if (encoderOut.hasRemaining()) { 421 return encoderOut.get() & 0xFF; 422 } 423 fillBuffer(); 424 if (endOfInput && !encoderOut.hasRemaining()) { 425 return EOF; 426 } 427 } 428 } 429 430 /** 431 * Reads the specified number of bytes into an array. 432 * 433 * @param b the byte array to read into, must not be {@code null} 434 * @return the number of bytes read or {@code -1} if the end of the stream has been reached. 435 * @throws NullPointerException if the byte array is {@code null}. 436 * @throws IOException if an I/O error occurs. 437 */ 438 @Override 439 public int read(final byte[] b) throws IOException { 440 return read(b, 0, b.length); 441 } 442 443 /** 444 * Reads the specified number of bytes into an array. 445 * 446 * @param array the byte array to read into. 447 * @param off the offset to start reading bytes into. 448 * @param len the number of bytes to read. 449 * @return the number of bytes read or {@code -1} if the end of the stream has been reached. 450 * @throws NullPointerException if the byte array is {@code null}. 451 * @throws IndexOutOfBoundsException if {@code off} or {@code len} are negative, or if {@code off + len} is greater than {@code array.length}. 452 * @throws IOException if an I/O error occurs. 453 */ 454 @Override 455 public int read(final byte[] array, int off, int len) throws IOException { 456 IOUtils.checkFromIndexSize(array, off, len); 457 if (len == 0) { 458 return 0; // Always return 0 if len == 0 459 } 460 int read = 0; 461 while (len > 0) { 462 if (encoderOut.hasRemaining()) { // Data from the last read not fully copied 463 final int c = Math.min(encoderOut.remaining(), len); 464 encoderOut.get(array, off, c); 465 off += c; 466 len -= c; 467 read += c; 468 } else if (endOfInput) { // Already reach EOF in the last read 469 break; 470 } else { // Read again 471 fillBuffer(); 472 } 473 } 474 return read == 0 && endOfInput ? EOF : read; 475 } 476}