1   package eu.fbk.knowledgestore.data;
2   
3   /*
4    * Copyright 2011 icedrake
5    * 
6    * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
7    * except in compliance with the License. You may obtain a copy of the License at
8    * 
9    * http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software distributed under the
12   * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13   * either express or implied. See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  import java.io.ByteArrayOutputStream;
18  import java.io.IOException;
19  import java.nio.CharBuffer;
20  
21  import com.google.common.base.Charsets;
22  
23  /**
24   * Smaz class for compression small strings. Port to java from <a
25   * href="https://github.com/antirez/smaz/">antirez</a> This class is immutable.
26   *
27   * @author icedrake
28   */
29  final class Smaz {
30  
31      private static final byte UNCOMPRESSED_FLAG = 1;
32  
33      /* Compression CODEBOOK, used for compression */
34      private static final String CODEBOOK[] = { "\002s,\266", "\003had\232\002leW", "\003on \216",
35              "", "\001yS", "\002ma\255\002li\227", "\003or \260", "", "\002ll\230\003s t\277",
36              "\004fromg\002mel", "", "\003its\332", "\001z\333", "\003ingF", "\001>\336",
37              "\001 \000\003   (\002nc\344", "\002nd=\003 on\312", "\002ne\213\003hat\276\003re q",
38              "", "\002ngT\003herz\004have\306\003s o\225", "", "\003ionk\003s a\254\002ly\352",
39              "\003hisL\003 inN\003 be\252", "", "\003 fo\325\003 of \003 ha\311", "", "\002of\005",
40              "\003 co\241\002no\267\003 ma\370", "", "", "\003 cl\356\003enta\003 an7",
41              "\002ns\300\001\"e", "\003n t\217\002ntP\003s, \205",
42              "\002pe\320\003 we\351\002om\223", "\002on\037", "", "\002y G", "\003 wa\271",
43              "\003 re\321\002or*", "", "\002=\"\251\002ot\337", "\003forD\002ou[", "\003 toR",
44              "\003 th\r", "\003 it\366", "\003but\261\002ra\202\003 wi\363\002</\361",
45              "\003 wh\237", "\002  4", "\003nd ?", "\002re!", "", "\003ng c", "",
46              "\003ly \307\003ass\323\001a\004\002rir", "", "", "", "\002se_", "\003of \"",
47              "\003div\364\002ros\003ere\240", "", "\002ta\310\001bZ\002si\324", "",
48              "\003and\u0007\002rs\335", "\002rt\362", "\002teE", "\003ati\316", "\002so\263",
49              "\002th\021", "\002tiJ\001c\034\003allp", "\003ate\345", "\002ss\246", "\002stM", "",
50              "\002><\346", "\002to\024", "\003arew", "\001d\030", "\002tr\303", "",
51              "\001\n1\003 a \222", "\003f tv\002veo", "\002un\340", "", "\003e o\242",
52              "\002a \243\002wa\326\001e\002", "\002ur\226\003e a\274", "\002us\244\003\n\r\n\247",
53              "\002ut\304\003e c\373", "\002we\221", "", "", "\002wh\302", "\001f,", "", "", "",
54              "\003d t\206", "", "", "\003th \343", "\001g;", "", "", "\001\r9\003e s\265",
55              "\003e t\234", "", "\003to Y", "\003e\r\n\236", "\002d \036\001h\022", "", "\001,Q",
56              "\002 a\031", "\002 b^", "\002\r\n\025\002 cI", "\002 d\245", "\002 e\253",
57              "\002 fh\001i\b\002e \013", "", "\002 hU\001-\314", "\002 i8", "", "", "\002 l\315",
58              "\002 m{", "\002f :\002 n\354", "\002 o\035", "\002 p}\001.n\003\r\n\r\250", "",
59              "\002 r\275", "\002 s>", "\002 t\016", "", "\002g \235\005which+\003whi\367",
60              "\002 w5", "\001/\305", "\003as \214", "\003at \207", "", "\003who\331", "",
61              "\001l\026\002h \212", "", "\002, $", "", "\004withV", "", "", "", "\001m-", "", "",
62              "\002ac\357", "\002ad\350", "\003TheH", "", "", "\004this\233\001n\t", "", "\002. y",
63              "", "\002alX\003e, \365", "\003tio\215\002be\\", "\002an\032\003ver\347", "",
64              "\004that0\003tha\313\001o\006", "\003was2", "\002arO", "\002as.",
65              "\002at'\003the\001\004they\200\005there\322\005theird", "\002ce\210", "\004were]",
66              "", "\002ch\231\002l \264\001p<", "", "", "\003one\256", "", "\003he \023\002dej",
67              "\003ter\270", "\002cou", "", "\002by\177\002di\201\002eax", "", "\002ec\327",
68              "\002edB", "\002ee\353", "", "", "\001r\f\002n )", "", "", "", "\002el\262", "",
69              "\003in i\002en3", "", "\002o `\001s\n", "", "\002er\033", "\003is t\002es6", "",
70              "\002ge\371", "\004.com\375", "\002fo\334\003our\330", "\003ch \301\001t\003",
71              "\002hab", "", "\003men\374", "", "\002he\020", "", "", "\001u&", "\002hif", "",
72              "\003not\204\002ic\203", "\003ed @\002id\355", "", "", "\002ho\273", "\002r K\001vm",
73              "", "", "", "\003t t\257\002il\360", "\002im\342", "\003en \317\002in\017",
74              "\002io\220", "\002s \027\001wA", "", "\003er |", "\003es ~\002is%", "\002it/", "",
75              "\002iv\272", "", "\002t #\u0007http://C\001x\372", "\002la\211", "\001<\341",
76              "\003, a\224" };
77  
78      /* Reverse compression CODEBOOK, used for decompression */
79      private static final String REVERSE_CODEBOOK[] = { " ", "the", "e", "t", "a", "of", "o",
80              "and", "i", "n", "s", "e ", "r", " th", " t", "in", "he", "th", "h", "he ", "to",
81              "\r\n", "l", "s ", "d", " a", "an", "er", "c", " o", "d ", "on", " of", "re", "of ",
82              "t ", ", ", "is", "u", "at", "   ", "n ", "or", "which", "f", "m", "as", "it", "that",
83              "\n", "was", "en", "  ", " w", "es", " an", " i", "\r", "f ", "g", "p", "nd", " s",
84              "nd ", "ed ", "w", "ed", "http://", "for", "te", "ing", "y ", "The", " c", "ti", "r ",
85              "his", "st", " in", "ar", "nt", ",", " to", "y", "ng", " h", "with", "le", "al",
86              "to ", "b", "ou", "be", "were", " b", "se", "o ", "ent", "ha", "ng ", "their", "\"",
87              "hi", "from", " f", "in ", "de", "ion", "me", "v", ".", "ve", "all", "re ", "ri",
88              "ro", "is ", "co", "f t", "are", "ea", ". ", "her", " m", "er ", " p", "es ", "by",
89              "they", "di", "ra", "ic", "not", "s, ", "d t", "at ", "ce", "la", "h ", "ne", "as ",
90              "tio", "on ", "n t", "io", "we", " a ", "om", ", a", "s o", "ur", "li", "ll", "ch",
91              "had", "this", "e t", "g ", "e\r\n", " wh", "ere", " co", "e o", "a ", "us", " d",
92              "ss", "\n\r\n", "\r\n\r", "=\"", " be", " e", "s a", "ma", "one", "t t", "or ", "but",
93              "el", "so", "l ", "e s", "s,", "no", "ter", " wa", "iv", "ho", "e a", " r", "hat",
94              "s t", "ns", "ch ", "wh", "tr", "ut", "/", "have", "ly ", "ta", " ha", " on", "tha",
95              "-", " l", "ati", "en ", "pe", " re", "there", "ass", "si", " fo", "wa", "ec", "our",
96              "who", "its", "z", "fo", "rs", ">", "ot", "un", "<", "im", "th ", "nc", "ate", "><",
97              "ver", "ad", " we", "ly", "ee", " n", "id", " cl", "ac", "il", "</", "rt", " wi",
98              "div", "e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com" };
99  
100     /**
101      * Returns compressed byte array for the specified string
102      *
103      * @param strg
104      * @return byte array
105      * @throws IOException
106      */
107     public static byte[] compress(final String strg) {
108 
109         final ByteArrayOutputStream output = new ByteArrayOutputStream();
110 
111         if (!isOnlyAscii(strg)) {
112             final byte[] bytes = strg.getBytes(Charsets.UTF_8);
113             output.write(UNCOMPRESSED_FLAG);
114             output.write(bytes, 0, bytes.length);
115             return output.toByteArray();
116         }
117 
118         final StringBuilder verb = new StringBuilder();
119 
120         final CharBuffer charBuffer = CharBuffer.wrap(strg);
121         int inlen;
122 
123         // loop through input looking for matches in codebook
124         while ((inlen = charBuffer.remaining()) > 0) {
125             int h1, h2, h3;
126             charBuffer.mark();
127             h1 = h2 = charBuffer.get() << 3;
128             if (inlen > 1) {
129                 h2 += charBuffer.get();
130             }
131             if (inlen > 2) {
132                 h3 = h2 ^ charBuffer.get();
133             } else {
134                 h3 = 0;
135             }
136             charBuffer.reset();
137 
138             int j = 7;
139             if (j > inlen) {
140                 j = inlen;
141             }
142 
143             boolean found = false;
144 
145             /*
146              * Try to lookup substrings into the codebook, starting from the longer to the shorter
147              * substrings
148              */
149             for (; j > 0; j--) {
150                 CharBuffer slot;
151                 if (j == 1) {
152                     slot = CharBuffer.wrap(CODEBOOK[h1 % 241]);
153                 } else if (j == 2) {
154                     slot = CharBuffer.wrap(CODEBOOK[h2 % 241]);
155                 } else {
156                     slot = CharBuffer.wrap(CODEBOOK[h3 % 241]);
157                 }
158 
159                 final int slotLength = slot.length();
160                 int slotIndex = 0;
161                 int slotEndIndex = slotIndex + j + 1;
162                 while (slotLength > 0 && slotEndIndex <= slotLength) {
163                     if (slot.get(slotIndex) == j
164                             && inlen >= j
165                             && slot.subSequence(slotIndex + 1, slotEndIndex).toString()
166                                     .equals(charBuffer.subSequence(0, j).toString())) {
167                         // Match found in codebook
168                         // Add verbatim data if needed
169                         if (verb.length() > 0) {
170                             // output the verbatim data now
171                             outputVerb(output, verb.toString());
172                             verb.setLength(0);
173                         }
174 
175                         // Add encoded data and ditch unnecessary part of input
176                         // string
177                         output.write(slot.get(slot.get(slotIndex) + 1 + slotIndex));
178                         charBuffer.position(charBuffer.position() + j);
179                         inlen -= j;
180                         found = true;
181                         break;
182                     } else {
183                         slotIndex++;
184                         slotEndIndex = slotIndex + j + 1;
185                     }
186                 }
187             }
188 
189             // match not found, add to verbatim
190             if (!found) {
191                 if (inlen > 0) {
192                     inlen--;
193                     verb.append(charBuffer.subSequence(0, 1).toString());
194                 }
195                 charBuffer.position(charBuffer.position() + 1);
196             }
197 
198             // If the verbatim buffer is getting too long or we're at the end of
199             // the doc throw the verbatim buffer to the output queue
200             final int verbLength = verb.length();
201             if (verbLength == 255 || verbLength > 0 && inlen == 0) {
202                 outputVerb(output, verb.toString());
203                 verb.setLength(0);
204             }
205 
206         }
207         return output.toByteArray();
208     }
209 
210     /**
211      * Decompress byte array from compress back into String
212      *
213      * @param strBytes
214      * @return decompressed String
215      * @see Smaz#compress(String)
216      */
217     public static String decompress(final byte[] strBytes) {
218 
219         if (strBytes[0] == UNCOMPRESSED_FLAG) {
220             return new String(strBytes, 1, strBytes.length, Charsets.UTF_8);
221         }
222 
223         final StringBuilder out = new StringBuilder();
224 
225         for (int i = 0; i < strBytes.length; i++) {
226             final char b = (char) (0xFF & strBytes[i]);
227             if (b == 254) {
228                 out.append((char) strBytes[++i]);
229             } else if (b == 255) {
230                 final int length = 0xFF & strBytes[++i];
231                 for (int j = 1; j <= length; j++) {
232                     out.append((char) strBytes[i + j]);
233                 }
234                 i += length;
235             } else {
236                 final int loc = 0xFF & b;
237                 out.append(REVERSE_CODEBOOK[loc]);
238             }
239         }
240         return out.toString();
241     }
242 
243     private static boolean isOnlyAscii(final String input) {
244 
245         final char[] chars = input.toCharArray();
246 
247         for (final char c : chars) {
248             if (c <= 31 || c >= 127) {
249                 return false;
250             }
251         }
252 
253         return true;
254     }
255 
256     /**
257      * Outputs the verbatim string to the output stream
258      *
259      * @param baos
260      * @param str
261      */
262     private static void outputVerb(final ByteArrayOutputStream baos, final String str) {
263         if (str.length() == 1) {
264             baos.write(254);
265             baos.write(str.toCharArray()[0]);
266         } else {
267             final byte[] bytes = str.getBytes(Charsets.UTF_8);
268             baos.write(255);
269             baos.write(str.length());
270             baos.write(bytes, 0, bytes.length);
271         }
272     }
273 
274     private Smaz() {
275     }
276 
277 }