1   package eu.fbk.knowledgestore.data;
2   
3   import java.io.ByteArrayInputStream;
4   import java.io.Closeable;
5   import java.io.File;
6   import java.io.FileInputStream;
7   import java.io.FileNotFoundException;
8   import java.io.IOException;
9   import java.io.InputStream;
10  import java.io.InputStreamReader;
11  import java.io.OutputStream;
12  import java.io.Reader;
13  import java.net.URL;
14  import java.net.URLConnection;
15  import java.nio.ByteBuffer;
16  import java.nio.CharBuffer;
17  import java.nio.charset.Charset;
18  import java.nio.charset.CharsetEncoder;
19  import java.nio.charset.CoderResult;
20  import java.nio.charset.CodingErrorAction;
21  import java.util.Date;
22  
23  import com.google.common.base.Charsets;
24  import com.google.common.base.Preconditions;
25  import com.google.common.base.Throwables;
26  import com.google.common.io.ByteStreams;
27  import com.google.common.io.CharSource;
28  import com.google.common.io.CharStreams;
29  import com.google.common.net.MediaType;
30  
31  import org.openrdf.model.URI;
32  import org.slf4j.Logger;
33  import org.slf4j.LoggerFactory;
34  
35  import eu.fbk.knowledgestore.vocabulary.KS;
36  import eu.fbk.knowledgestore.vocabulary.NFO;
37  import eu.fbk.knowledgestore.vocabulary.NIE;
38  import eu.fbk.rdfpro.util.IO;
39  
40  /**
41   * A digital representation of a resource.
42   * <p>
 * A {@code Representation} object provides access to the binary or character representation of a
 * resource, together with the associated (mutable) representation metadata
 * ({@link #getMetadata()}).
46   * </p>
47   * <p>
48   * Representation data can be consumed either as a stream of bytes or as a stream of characters.
49   * Conversion from one stream to another is performed if necessary using the charset encoded by
50   * metadata attribute {@link NIE#MIME_TYPE}.
51   * </p>
52   * <p>
53   * A representation encapsulates an open {@code InputStream} or {@code Reader} that provide access
54   * to the representation data (see methods {@link #getInputStream()} and {@link #getReader()}),
55   * hence it is a {@code Closeable} object that MUST be closed after use. In addition to these
56   * methods, a number of {@code writeToXXX()} helper methods allow to consume the representation
57   * data in different ways:
58   * </p>
59   * <ul>
60   * <li>{@link #writeToByteArray()} returns a byte array with all the representation data;</li>
 * <li>{@link #writeToString()} returns a string with all the representation character data;</li>
62   * <li>{@link #writeTo(OutputStream)} writes all the binary representation data to a supplied
63   * {@code OutputStream};</li>
64   * <li>{@link #writeTo(Appendable)} writes the character representation data to a supplied
65   * {@code Appendable};</li>
66   * </ul>
67   * <p>
68   * Note that these methods exhaust the {@code InputStream} / {@code Reader} associated to the
69   * representation. Moreover, in case some data has already been read, it will not be written by
70   * those methods.
71   * </p>
72   * <p>
73   * Representation objects are created via {@code create()} factory methods that take care of
74   * acquiring both an {@code InputStream} / {@code Reader} over the data and the associated
75   * metadata starting from a number of sources:
76   * </p>
77   * <ul>
 * <li>{@link #create(InputStream)} builds a representation out of an {@code InputStream};</li>
 * <li>{@link #create(Reader)} builds a representation out of a {@code Reader};</li>
 * <li>{@link #create(byte[])} builds a representation out of a byte array, including
 * metadata about the representation length;</li>
 * <li>{@link #create(CharSequence)} builds a representation out of a {@code CharSequence}
 * (e.g., a {@code String});</li>
 * <li>{@link #create(File, boolean)} builds a representation out of a file, including metadata
 * about the file name, size, mime type (from the extension) and last modified time;</li>
 * <li>{@link #create(URL)} builds a representation out of a resolvable URL, including
 * metadata about the file name, file size, mime type (from the content type or the extension),
 * MD5 hash, last modified time.</li>
89   * </ul>
90   * <p>
91   * Representation objects are mutable but thread safe. Object equality is used for
92   * {@code equals()} and {@code hashCode()}.
93   * </p>
94   */
95  public final class Representation implements Closeable {
96  
    /** Logger used to report failures raised while closing the wrapped data source. */
    private static final Logger LOGGER = LoggerFactory.getLogger(Representation.class);

    /** The wrapped data source; it is handled either as an InputStream or as a Reader. */
    private final Closeable data; // InputStream or Reader

    /** The mutable metadata record returned by {@code getMetadata()}. */
    private final Record metadata;
102 
103     private Representation(final Closeable data) {
104         this.data = Preconditions.checkNotNull(data);
105         this.metadata = Record.create(null, KS.REPRESENTATION);
106     }
107 
108     private Charset getCharset() {
109         final String mimeType = this.metadata.getUnique(NIE.MIME_TYPE, String.class);
110         if (mimeType == null) {
111             return Charsets.UTF_8;
112         }
113         try {
114             return MediaType.parse(mimeType).charset().or(Charsets.UTF_8);
115         } catch (final Throwable ex) {
116             throw new IllegalArgumentException("Invalid mime type in metadata: " + mimeType, ex);
117         }
118     }
119 
    /**
     * Invokes {@link #close()} when this object is finalized, so that the wrapped data source
     * is closed even if the representation was never closed explicitly.
     *
     * @throws Throwable
     *             if thrown by {@code super.finalize()}
     */
    @Override
    protected void finalize() throws Throwable {
        try {
            close();
        } finally {
            // Always chain to the superclass finalizer, whatever close() did.
            super.finalize();
        }
    }
128 
129     /**
130      * Creates a representation based on the {@code InputStream} specified. Note that the supplied
131      * {@code InputStream} is never closed by this class: it MUST be closed externally under the
132      * responsibility of the caller.
133      *
134      * @param stream
135      *            the {@code InputStream}, not null
136      * @return the created representation
137      */
138     public static Representation create(final InputStream stream) {
139         return new Representation(stream);
140     }
141 
142     /**
143      * Creates a representation based on the byte array specified. The length of the byte array
144      * will be reflected in the returned representation metadata (property {@link NFO#FILE_SIZE}).
145      * Note that the byte array should not be changed after calling this method, as modification
146      * could be (partially) reflected in the returned representation.
147      *
148      * @param bytes
149      *            the byte array containing the binary data of the representation
150      * @return the created representation
151      */
152     public static Representation create(final byte[] bytes) {
153         final Representation representation = new Representation(new ByteArrayInputStream(bytes));
154         representation.metadata.set(NFO.FILE_SIZE, (long) bytes.length);
155         return representation;
156     }
157 
158     /**
159      * Creates a representation based on the {@code File} specified. The file length, size,
160      * creation time and MIME type will be reflected in the returned representation metadata
161      * (respectively, properties {@link NFO#FILE_SIZE}, {@link NFO#FILE_NAME},
162      * {@link NFO#FILE_LAST_MODIFIED}, {@link NIE#MIME_TYPE}). Note that this method causes the
163      * file to be opened for reading.
164      *
165      * @param file
166      *            the file containing the binary data of the representation
167      * @param autoDecompress
168      *            automatically decompress the file, if compressed with gzip, bzip2, xz, 7z or lz4
169      * @return the created representation
170      * @throws IllegalArgumentException
171      *             in case the file does not exist
172      */
173     public static Representation create(final File file, final boolean autoDecompress)
174             throws IllegalArgumentException {
175         try {
176             String name = file.getName();
177             final Representation representation;
178             if (autoDecompress) {
179                 byte[] bytes = ByteStreams.toByteArray(IO.read(file.getAbsolutePath()));
180                 representation = new Representation(new ByteArrayInputStream(bytes));
181                 if (name.endsWith(".gz") || name.endsWith(".xz") || name.endsWith(".7z")) {
182                     name = name.substring(0, name.length() - 3);
183                 } else if (name.endsWith(".bz2") || name.endsWith(".lz4")) {
184                     name = name.substring(0, name.length() - 4);
185                 }
186             } else {
187                 representation = new Representation(IO.buffer(new FileInputStream(file)));
188             }
189             representation.metadata.set(NFO.FILE_SIZE, file.length());
190             representation.metadata.set(NFO.FILE_NAME, name);
191             representation.metadata.set(NFO.FILE_LAST_MODIFIED, new Date(file.lastModified()));
192             representation.metadata.set(NIE.MIME_TYPE, Data.extensionToMimeType(name));
193             return representation;
194         } catch (final FileNotFoundException ex) {
195             throw new IllegalArgumentException("Not a file: " + file.getAbsolutePath());
196         } catch (final IOException e) {
197             throw new IllegalArgumentException("IOException on file: " + file.getAbsolutePath());
198         }
199     }
200 
201     /**
202      * Creates a representation based on the resolvable URL specified. This method has the effect
203      * of acquiring a connection to the supplied URL, from which the representation stream and a
204      * number of metadata attributes are extracted. These attributes include the last modified
205      * timestamp ({@link NFO#FILE_LAST_MODIFIED}), the MIME type ({@link NIE#MIME_TYPE}), the file
206      * size ({@link NFO#FILE_SIZE}), the file name ({@link NFO#FILE_NAME}) and the MD5 hash (
207      * {@link NFO#HAS_HASH}); all of these attributes are optional and are extracted only if
208      * available.
209      *
210      * @param url
211      *            the URL that, resolved, will produced the binary data of the representation
212      * @return the created representation
213      * @throws IllegalArgumentException
214      *             in case acquiring a connection to the supplied URL fails
215      */
216     public static Representation create(final URL url) throws IllegalArgumentException {
217 
218         // Acquire a connection and open an InputStream over its entity content.
219         URLConnection connection;
220         InputStream stream;
221         try {
222             connection = url.openConnection();
223             connection.connect();
224             stream = connection.getInputStream();
225         } catch (final IOException ex) {
226             throw new IllegalArgumentException("Cannot acquire a connection to URL " + url, ex);
227         }
228 
229         // Wrap the stream in a Representation object.
230         final Representation representation = new Representation(stream);
231 
232         try {
233             // Extract last modified.
234             final long lastModified = connection.getLastModified();
235             if (lastModified != 0) {
236                 representation.metadata.set(NFO.FILE_LAST_MODIFIED, new Date(lastModified));
237             }
238 
239             // Extract MIME type from "Content-Type".
240             String mimeType = connection.getContentType();
241             if (mimeType == null) {
242                 mimeType = Data.extensionToMimeType(url.getFile());
243             }
244             representation.metadata.set(NIE.MIME_TYPE, mimeType);
245 
246             // Extract length from "Content-Length";
247             final int length = connection.getContentLength();
248             if (length >= 0) {
249                 representation.metadata.set(NFO.FILE_SIZE, length);
250             }
251 
252             // Extract the filename either from "Content-Disposition" header or from URL.
253             String filename = null;
254             final String disposition = connection.getHeaderField("Content-Disposition");
255             if (disposition != null && disposition.contains("filename")) {
256                 final int start = Math.max(disposition.indexOf('\"'), disposition.indexOf('\''));
257                 if (start > 0) {
258                     final int end = Math.max(disposition.lastIndexOf('\"'),
259                             disposition.lastIndexOf('\''));
260                     if (end > 0) {
261                         filename = disposition.substring(start + 1, end);
262                     }
263                 }
264             }
265             if (filename == null) {
266                 final String path = url.getPath();
267                 final int index = path.lastIndexOf('/');
268                 if (index >= 0) {
269                     filename = path.substring(index + 1);
270                 }
271 
272             }
273             representation.metadata.set(NFO.FILE_NAME, filename);
274 
275             // Extract the MD5 hash from "Content-MD5".
276             final String md5 = connection.getHeaderField("Content-MD5");
277             if (md5 != null) {
278                 final Record hash = Record.create();
279                 hash.set(NFO.HASH_ALGORITHM, "MD5");
280                 hash.set(NFO.HASH_VALUE, md5);
281                 representation.metadata.set(NFO.HAS_HASH, hash);
282             }
283 
284             // Return the representation built.
285             return representation;
286 
287         } catch (final Throwable ex) {
288             // Ensure to close the connection if something goes wrong.
289             try {
290                 connection.getInputStream().close();
291             } catch (final Throwable ex2) {
292                 // ignore
293             }
294             throw Throwables.propagate(ex);
295         }
296     }
297 
298     /**
299      * Creates a representation based on the {@code Reader} specified. Note that the supplied
300      * {@code Reader} is never closed by this class: it MUST be closed externally under the
301      * responsibility of the caller. Upon request (e.g., invocation of {@link #getInputStream()}),
302      * character data produced by the {@code Reader} will be translated into byte data either
303      * using the charset specified in the representation metadata (property {@link NIE#MIME_TYPE})
304      * or by using UTF-8.
305      *
306      * @param reader
307      *            the reader producing the character data of the representation
308      * @return the created representation
309      */
310     public static Representation create(final Reader reader) {
311         Preconditions.checkNotNull(reader);
312         return new Representation(reader);
313     }
314 
315     /**
316      * Creates a representation based on the {@code CharSequence} specified. Upon request (e.g.,
317      * invocation of {@link #getInputStream()}), character data produced by the {@code Reader}
318      * will be translated into byte data either using the charset specified in the representation
319      * metadata (property {@link NIE#MIME_TYPE}) or by using UTF-8.
320      *
321      * @param sequence
322      *            the {@code CharSequence} with the character data of the representation
323      * @return the created representation
324      */
325     public static Representation create(final CharSequence sequence) {
326         try {
327             return new Representation(CharSource.wrap(sequence).openStream());
328         } catch (final IOException ex) {
329             throw new Error("Unexpected exception (!): " + ex.getMessage(), ex);
330         }
331     }
332 
    /**
     * Returns the metadata about this representation. The returned object is the record held
     * internally by the representation (not a copy), so changes applied to it are seen by the
     * representation itself, e.g., when the charset is looked up from the MIME type.
     *
     * @return the representation metadata, not null
     */
    public Record getMetadata() {
        return this.metadata;
    }
341 
342     /**
343      * Returns an {@code InputStream} over the binary data of this representation object.
344      * Conversion from character to byte data, if required, is performed according to the charset
345      * specified by the MIME type metadata property ({@link NIE#MIME_TYPE}).
346      *
347      * @return an {@code InputStream} over the binary content of this representation
348      */
349     public InputStream getInputStream() {
350         if (this.data instanceof InputStream) {
351             return (InputStream) this.data;
352         } else {
353             final Reader reader = (Reader) this.data;
354             return new ReaderInputStream(reader, getCharset());
355         }
356     }
357 
358     /**
359      * Returns a {@code Reader} over the character data of this representation object. Conversion
360      * from byte to character data, if required, is performed according to the charset specified
361      * by the MIME type metadata property ({@link NIE#MIME_TYPE}).
362      *
363      * @return a {@code Reader} providing access to the character data of the representation.
364      */
365     public Reader getReader() {
366         if (this.data instanceof Reader) {
367             return (Reader) this.data;
368         } else {
369             final InputStream stream = (InputStream) this.data;
370             return new InputStreamReader(stream, getCharset());
371         }
372     }
373 
374     /**
375      * Writes all the binary data of this representation to a {@code byte[]} object. Conversion
376      * from character to byte data, if required, is performed according to the charset specified
377      * by the MIME type metadata property ({@link NIE#MIME_TYPE}). If some data has been already
378      * read via {@code getInputStream()} or {@code getReaer()}, it will not be returned in the
379      * result.
380      *
381      * @return a byte array with the binary content of this representation
382      * @throws IOException
383      *             in case access to binary data fails
384      */
385     public byte[] writeToByteArray() throws IOException {
386         final InputStream stream = getInputStream();
387         try {
388             return ByteStreams.toByteArray(stream);
389         } finally {
390             stream.close();
391         }
392     }
393 
394     /**
395      * Writes all the character data of this representation to a {@code String} object. Conversion
396      * from byte to character data, if required, is performed according to the charset specified
397      * by the MIME type metadata property ({@link NIE#MIME_TYPE}). If some data has been already
398      * read via {@code getInputStream()} or {@code getReaer()}, it will not be returned in the
399      * result.
400      *
401      * @return a {@code String} containg the full character-based content of this representation
402      * @throws IOException
403      *             in case access to binary data fails
404      */
405     public String writeToString() throws IOException {
406         final Reader reader = getReader();
407         try {
408             return CharStreams.toString(reader);
409         } finally {
410             reader.close();
411         }
412     }
413 
414     /**
415      * Writes all the binary data of this representation to the {@code OutputStream} sink
416      * specified. Conversion from character to byte data, if required, is performed according to
417      * the charset specified by the MIME type metadata property ({@link NIE#MIME_TYPE}). If some
418      * data has been already read via {@code getInputStream()} or {@code getReaer()}, it will not
419      * be written to the supplied sink.
420      *
421      * @param sink
422      *            the sink where to write binary data to
423      * @throws IOException
424      *             in case access to binary data fails
425      */
426     public void writeTo(final OutputStream sink) throws IOException {
427         final InputStream in = getInputStream();
428         try {
429             ByteStreams.copy(in, sink);
430         } finally {
431             in.close();
432         }
433     }
434 
435     /**
436      * Writes all the character data of this representation to the {@code Appendable} sink
437      * specified. Conversion from byte to character data, if required, is performed according to
438      * the charset specified by the MIME type metadata property ({@link NIE#MIME_TYPE}). If some
439      * data has been already read via {@code getInputStream()} or {@code getReaer()}, it will not
440      * be written to the supplied sink.
441      *
442      * @param sink
443      *            the sink where to write character data to
444      * @throws IOException
445      *             in case access to binary data fails
446      */
447     public void writeTo(final Appendable sink) throws IOException {
448         final Reader reader = getReader();
449         try {
450             CharStreams.copy(reader, sink);
451         } finally {
452             reader.close();
453         }
454     }
455 
    /**
     * Closes the {@code InputStream} or {@code Reader} wrapped by this representation. Any
     * exception raised while closing is logged at warning level and not propagated to the
     * caller.
     */
    @Override
    public void close() {
        try {
            this.data.close();
        } catch (final Exception ex) {
            // Closing is best effort: report the problem but never fail the caller.
            LOGGER.warn("Exception caught while closing representation", ex);
        }
    }
464 
465     /**
466      * {@inheritDoc} The returned representation contains the associated resource ID.
467      */
468     @Override
469     public String toString() {
470         final String file = this.metadata.getUnique(NFO.FILE_NAME, String.class, "unnamed file");
471         final String type = this.metadata.getUnique(NIE.MIME_TYPE, String.class, "unknown type");
472         final long size = this.metadata.getUnique(NFO.FILE_SIZE, Long.class, -1L);
473         return file + ", " + type + ", " + (size >= 0 ? size + " bytes" : "unknown size");
474     }
475 
    // Source: org.apache.commons.io.input.ReaderInputStream
    /**
     * An {@code InputStream} that produces the bytes obtained by encoding, with a given charset,
     * the characters read from a {@code Reader}. Characters that are malformed or cannot be
     * mapped in the target charset are replaced rather than reported as errors.
     * <p>
     * NOTE(review): this is declared as a non-static inner class although its code only accesses
     * its own fields. As a side effect, an instance may keep the enclosing
     * {@code Representation} reachable; evaluate the interaction with
     * {@code Representation.finalize()} before turning it into a static nested class.
     * </p>
     */
    private class ReaderInputStream extends InputStream {

        /** Capacity, in characters, of the buffer holding data read from the reader. */
        private static final int BUFFER_SIZE = 1024;

        /** The source of character data. */
        private final Reader reader;

        /** The encoder turning characters into bytes. */
        private final CharsetEncoder enc;

        /** Characters read from the reader and not yet encoded; kept in read (flipped) mode. */
        private final CharBuffer encIn;

        /** Encoded bytes not yet returned to the caller; kept in read (flipped) mode. */
        private final ByteBuffer encOut;

        /** Outcome of the last encoding step, null if no encoding step was performed yet. */
        private CoderResult lastCoderResult;

        /** Set once the reader has signalled the end of its data. */
        private boolean eof;

        /**
         * Creates a stream encoding the characters of the supplied reader.
         *
         * @param reader
         *            the reader to read characters from
         * @param charset
         *            the charset used to encode characters into bytes
         */
        ReaderInputStream(final Reader reader, final Charset charset) {

            this.reader = reader;
            this.enc = charset.newEncoder().onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE);
            // Both buffers start empty: flipping right after allocation leaves nothing to read.
            this.encIn = CharBuffer.allocate(BUFFER_SIZE);
            this.encIn.flip();
            this.encOut = ByteBuffer.allocate(128);
            this.encOut.flip();
        }

        /**
         * Reads more characters from the reader, if needed, and encodes the buffered characters
         * into the output buffer.
         *
         * @throws IOException
         *             if reading from the underlying reader fails
         */
        private void fillBuffer() throws IOException {

            // More input is fetched only before the first encoding step or after an underflow,
            // i.e., not when the previous step stopped for another reason (e.g., full output).
            if (!this.eof && (this.lastCoderResult == null || this.lastCoderResult.isUnderflow())) {
                // Switch to write mode, preserving the characters not yet encoded.
                this.encIn.compact();
                final int p = this.encIn.position();
                // Read directly into the backing array, in the free space after position p.
                final int c = this.reader.read(this.encIn.array(), p, this.encIn.remaining());
                if (c == -1) {
                    this.eof = true;
                } else {
                    this.encIn.position(p + c);
                }
                // Back to read mode for the encoder.
                this.encIn.flip();
            }

            // Switch the output buffer to write mode, encode, then expose the bytes for reading.
            this.encOut.compact();
            this.lastCoderResult = this.enc.encode(this.encIn, this.encOut, this.eof);
            this.encOut.flip();
        }

        /**
         * Reads up to {@code length} bytes into the supplied array, looping until the requested
         * amount has been copied or the end of the data is reached.
         *
         * @param b
         *            the destination array, not null
         * @param offset
         *            the index in {@code b} where to start storing bytes
         * @param length
         *            the maximum number of bytes to read
         * @return the number of bytes copied; at end of data with nothing copied, 0 if
         *         {@code length} is 0 and -1 otherwise
         * @throws IOException
         *             if reading from the underlying reader fails
         */
        @Override
        public int read(final byte[] b, final int offset, final int length) throws IOException {

            Preconditions.checkNotNull(b);
            Preconditions.checkPositionIndex(offset, b.length);
            Preconditions.checkPositionIndex(offset + length, b.length);

            int read = 0;
            int o = offset;
            int l = length;

            while (l > 0) {
                if (this.encOut.hasRemaining()) {
                    // Hand over as many already encoded bytes as requested.
                    final int c = Math.min(this.encOut.remaining(), l);
                    this.encOut.get(b, o, c);
                    o += c;
                    l -= c;
                    read += c;
                } else {
                    // Nothing buffered: encode more data, giving up at end of data.
                    fillBuffer();
                    if (this.eof && !this.encOut.hasRemaining()) {
                        break;
                    }
                }
            }

            // Some bytes copied or not at end of data: return the count. Otherwise return 0 for
            // a zero-length request and -1 to signal the end of the stream.
            return read > 0 || !this.eof ? read : l == 0 ? 0 : -1;
        }

        /**
         * Reads a single byte.
         *
         * @return the byte read, as a value between 0 and 255, or -1 at the end of the data
         * @throws IOException
         *             if reading from the underlying reader fails
         */
        @Override
        public int read() throws IOException {

            for (;;) {
                if (this.encOut.hasRemaining()) {
                    // Mask to return the byte as an unsigned value.
                    return this.encOut.get() & 0xFF;
                } else {
                    fillBuffer();
                    if (this.eof && !this.encOut.hasRemaining()) {
                        return -1;
                    }
                }
            }
        }

        /**
         * Closes the underlying reader.
         *
         * @throws IOException
         *             if closing the reader fails
         */
        @Override
        public void close() throws IOException {
            this.reader.close();
        }

    }
573 
574 }