1   package eu.fbk.knowledgestore.data;
2   
3   
4   
5   
6   
7   
8   
9   
10  
11  
12  
13  
14  
15  
16  
17  import java.io.ByteArrayOutputStream;
18  import java.io.IOException;
19  import java.nio.CharBuffer;
20  
21  import com.google.common.base.Charsets;
22  
23  
24  
25  
26  
27  
28  
/**
 * Codebook-based compressor for short strings.
 * <p>
 * Encoded form produced by {@link #compress(String)} and read by {@link #decompress(byte[])}:
 * </p>
 * <ul>
 * <li>an empty array encodes the empty string;</li>
 * <li>if the first byte is {@code 1}, the remaining bytes are the UTF-8 encoding of the input,
 * stored without compression;</li>
 * <li>otherwise the array is a sequence of tokens: a byte in {@code 0..253} is an index into the
 * reverse codebook, {@code 254} is followed by one verbatim character, and {@code 255} is
 * followed by a length byte and that many verbatim characters.</li>
 * </ul>
 * <p>
 * Only strings made exclusively of characters in the range {@code 32..126} are compressed; any
 * other string is stored in the uncompressed form.
 * </p>
 */
final class Smaz {

    /** Value of the first byte marking an array that holds the input as plain UTF-8. */
    private static final byte UNCOMPRESSED_FLAG = 1;

    /** Token introducing a single verbatim character. */
    private static final int VERBATIM_SINGLE = 254;

    /** Token introducing a length-prefixed run of verbatim characters. */
    private static final int VERBATIM_RUN = 255;

    /** Longest verbatim run that fits the single length byte following {@link #VERBATIM_RUN}. */
    private static final int MAX_VERBATIM_RUN = 255;

    /** Longest codebook entry that is looked up at any input position. */
    private static final int MAX_ENTRY_LENGTH = 7;

    /** Modulus applied to the hash of the next characters in order to select a codebook slot. */
    private static final int HASH_SLOTS = 241;

    /** Charset of the uncompressed form; the JDK constant is used so no extra library is needed. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Compression codebook, indexed by hash slot. Each slot is a concatenation of entries, where
     * an entry is: one character whose value is the entry length, the entry text, and one
     * character whose value is the code emitted for that text.
     */
    private static final String[] CODEBOOK = { "\002s,\266", "\003had\232\002leW", "\003on \216",
            "", "\001yS", "\002ma\255\002li\227", "\003or \260", "", "\002ll\230\003s t\277",
            "\004fromg\002mel", "", "\003its\332", "\001z\333", "\003ingF", "\001>\336",
            "\001 \000\003   (\002nc\344", "\002nd=\003 on\312", "\002ne\213\003hat\276\003re q",
            "", "\002ngT\003herz\004have\306\003s o\225", "", "\003ionk\003s a\254\002ly\352",
            "\003hisL\003 inN\003 be\252", "", "\003 fo\325\003 of \003 ha\311", "", "\002of\005",
            "\003 co\241\002no\267\003 ma\370", "", "", "\003 cl\356\003enta\003 an7",
            "\002ns\300\001\"e", "\003n t\217\002ntP\003s, \205",
            "\002pe\320\003 we\351\002om\223", "\002on\037", "", "\002y G", "\003 wa\271",
            "\003 re\321\002or*", "", "\002=\"\251\002ot\337", "\003forD\002ou[", "\003 toR",
            "\003 th\r", "\003 it\366", "\003but\261\002ra\202\003 wi\363\002</\361",
            "\003 wh\237", "\002  4", "\003nd ?", "\002re!", "", "\003ng c", "",
            "\003ly \307\003ass\323\001a\004\002rir", "", "", "", "\002se_", "\003of \"",
            "\003div\364\002ros\003ere\240", "", "\002ta\310\001bZ\002si\324", "",
            "\003and\u0007\002rs\335", "\002rt\362", "\002teE", "\003ati\316", "\002so\263",
            "\002th\021", "\002tiJ\001c\034\003allp", "\003ate\345", "\002ss\246", "\002stM", "",
            "\002><\346", "\002to\024", "\003arew", "\001d\030", "\002tr\303", "",
            "\001\n1\003 a \222", "\003f tv\002veo", "\002un\340", "", "\003e o\242",
            "\002a \243\002wa\326\001e\002", "\002ur\226\003e a\274", "\002us\244\003\n\r\n\247",
            "\002ut\304\003e c\373", "\002we\221", "", "", "\002wh\302", "\001f,", "", "", "",
            "\003d t\206", "", "", "\003th \343", "\001g;", "", "", "\001\r9\003e s\265",
            "\003e t\234", "", "\003to Y", "\003e\r\n\236", "\002d \036\001h\022", "", "\001,Q",
            "\002 a\031", "\002 b^", "\002\r\n\025\002 cI", "\002 d\245", "\002 e\253",
            "\002 fh\001i\b\002e \013", "", "\002 hU\001-\314", "\002 i8", "", "", "\002 l\315",
            "\002 m{", "\002f :\002 n\354", "\002 o\035", "\002 p}\001.n\003\r\n\r\250", "",
            "\002 r\275", "\002 s>", "\002 t\016", "", "\002g \235\005which+\003whi\367",
            "\002 w5", "\001/\305", "\003as \214", "\003at \207", "", "\003who\331", "",
            "\001l\026\002h \212", "", "\002, $", "", "\004withV", "", "", "", "\001m-", "", "",
            "\002ac\357", "\002ad\350", "\003TheH", "", "", "\004this\233\001n\t", "", "\002. y",
            "", "\002alX\003e, \365", "\003tio\215\002be\\", "\002an\032\003ver\347", "",
            "\004that0\003tha\313\001o\006", "\003was2", "\002arO", "\002as.",
            "\002at'\003the\001\004they\200\005there\322\005theird", "\002ce\210", "\004were]",
            "", "\002ch\231\002l \264\001p<", "", "", "\003one\256", "", "\003he \023\002dej",
            "\003ter\270", "\002cou", "", "\002by\177\002di\201\002eax", "", "\002ec\327",
            "\002edB", "\002ee\353", "", "", "\001r\f\002n )", "", "", "", "\002el\262", "",
            "\003in i\002en3", "", "\002o `\001s\n", "", "\002er\033", "\003is t\002es6", "",
            "\002ge\371", "\004.com\375", "\002fo\334\003our\330", "\003ch \301\001t\003",
            "\002hab", "", "\003men\374", "", "\002he\020", "", "", "\001u&", "\002hif", "",
            "\003not\204\002ic\203", "\003ed @\002id\355", "", "", "\002ho\273", "\002r K\001vm",
            "", "", "", "\003t t\257\002il\360", "\002im\342", "\003en \317\002in\017",
            "\002io\220", "\002s \027\001wA", "", "\003er |", "\003es ~\002is%", "\002it/", "",
            "\002iv\272", "", "\002t #\u0007http://C\001x\372", "\002la\211", "\001<\341",
            "\003, a\224" };

    /** Decompression codebook: the element at index {@code c} is the text encoded by code c. */
    private static final String[] REVERSE_CODEBOOK = { " ", "the", "e", "t", "a", "of", "o",
            "and", "i", "n", "s", "e ", "r", " th", " t", "in", "he", "th", "h", "he ", "to",
            "\r\n", "l", "s ", "d", " a", "an", "er", "c", " o", "d ", "on", " of", "re", "of ",
            "t ", ", ", "is", "u", "at", "   ", "n ", "or", "which", "f", "m", "as", "it", "that",
            "\n", "was", "en", "  ", " w", "es", " an", " i", "\r", "f ", "g", "p", "nd", " s",
            "nd ", "ed ", "w", "ed", "http://", "for", "te", "ing", "y ", "The", " c", "ti", "r ",
            "his", "st", " in", "ar", "nt", ",", " to", "y", "ng", " h", "with", "le", "al",
            "to ", "b", "ou", "be", "were", " b", "se", "o ", "ent", "ha", "ng ", "their", "\"",
            "hi", "from", " f", "in ", "de", "ion", "me", "v", ".", "ve", "all", "re ", "ri",
            "ro", "is ", "co", "f t", "are", "ea", ". ", "her", " m", "er ", " p", "es ", "by",
            "they", "di", "ra", "ic", "not", "s, ", "d t", "at ", "ce", "la", "h ", "ne", "as ",
            "tio", "on ", "n t", "io", "we", " a ", "om", ", a", "s o", "ur", "li", "ll", "ch",
            "had", "this", "e t", "g ", "e\r\n", " wh", "ere", " co", "e o", "a ", "us", " d",
            "ss", "\n\r\n", "\r\n\r", "=\"", " be", " e", "s a", "ma", "one", "t t", "or ", "but",
            "el", "so", "l ", "e s", "s,", "no", "ter", " wa", "iv", "ho", "e a", " r", "hat",
            "s t", "ns", "ch ", "wh", "tr", "ut", "/", "have", "ly ", "ta", " ha", " on", "tha",
            "-", " l", "ati", "en ", "pe", " re", "there", "ass", "si", " fo", "wa", "ec", "our",
            "who", "its", "z", "fo", "rs", ">", "ot", "un", "<", "im", "th ", "nc", "ate", "><",
            "ver", "ad", " we", "ly", "ee", " n", "id", " cl", "ac", "il", "</", "rt", " wi",
            "div", "e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com" };

    /**
     * Encodes the supplied string into the byte form understood by {@link #decompress(byte[])}.
     * <p>
     * Strings containing any character outside {@code 32..126} are stored uncompressed (flag
     * byte followed by their UTF-8 bytes). The same fallback is used when the compressed stream
     * would begin with the flag value, because the decoder could not tell the two forms apart.
     * </p>
     *
     * @param strg
     *            the string to encode, not null
     * @return the encoded bytes; an empty array for the empty string
     */
    public static byte[] compress(final String strg) {

        if (!isOnlyAscii(strg)) {
            return encodeUncompressed(strg);
        }

        final int length = strg.length();
        final ByteArrayOutputStream output = new ByteArrayOutputStream();
        final StringBuilder verb = new StringBuilder(); // pending verbatim characters

        int pos = 0;
        while (pos < length) {
            // Hashes of the next one, two and three characters select the codebook slot that
            // may hold an entry of length one, two, and three or more respectively.
            final int available = length - pos;
            final int h1 = strg.charAt(pos) << 3;
            final int h2 = available > 1 ? h1 + strg.charAt(pos + 1) : h1;
            final int h3 = available > 2 ? h2 ^ strg.charAt(pos + 2) : 0;

            boolean found = false;

            // Try the longest candidate first, then progressively shorter ones.
            for (int j = Math.min(MAX_ENTRY_LENGTH, available); j > 0; j--) {
                final int hash = j == 1 ? h1 : j == 2 ? h2 : h3;
                final int code = findCode(CODEBOOK[hash % HASH_SLOTS], strg, pos, j);
                if (code < 0) {
                    continue;
                }

                // Verbatim characters collected so far precede the match in the output.
                if (verb.length() > 0) {
                    outputVerb(output, verb.toString());
                    verb.setLength(0);
                }

                output.write(code);
                pos += j;
                found = true;
                // Deliberately no break: shorter lengths keep being tried at the new position
                // (with the slots selected above), exactly as in the previous implementation,
                // so that the emitted bytes stay identical. Any additional match is a genuine
                // codebook entry for the text at the new position, hence still decodable.
            }

            if (!found) {
                // No entry starts here: queue the character for verbatim output.
                verb.append(strg.charAt(pos));
                pos++;
            }

            // Flush when the run reaches the maximum encodable length or the input is over.
            final int verbLength = verb.length();
            if (verbLength == MAX_VERBATIM_RUN || verbLength > 0 && pos == length) {
                outputVerb(output, verb.toString());
                verb.setLength(0);
            }
        }

        final byte[] encoded = output.toByteArray();
        if (encoded.length > 0 && encoded[0] == UNCOMPRESSED_FLAG) {
            // Code 1 (the entry "the") as first byte is indistinguishable from the flag.
            return encodeUncompressed(strg);
        }
        return encoded;
    }

    /**
     * Decodes a byte array produced by {@link #compress(String)}.
     * <p>
     * An array whose first byte equals the uncompressed flag is always decoded as UTF-8 text;
     * a compressed stream beginning with code 1 therefore cannot be recognized as such.
     * </p>
     *
     * @param strBytes
     *            the encoded bytes, not null
     * @return the decoded string; the empty string for an empty array
     */
    public static String decompress(final byte[] strBytes) {

        if (strBytes.length == 0) {
            return "";
        }

        if (strBytes[0] == UNCOMPRESSED_FLAG) {
            // The third argument is a byte count, not an end index: skip only the flag byte.
            return new String(strBytes, 1, strBytes.length - 1, UTF_8);
        }

        final StringBuilder out = new StringBuilder();

        int i = 0;
        while (i < strBytes.length) {
            final int token = strBytes[i++] & 0xFF;
            if (token == VERBATIM_SINGLE) {
                out.append((char) strBytes[i++]);
            } else if (token == VERBATIM_RUN) {
                final int runLength = strBytes[i++] & 0xFF;
                for (final int end = i + runLength; i < end; i++) {
                    out.append((char) strBytes[i]);
                }
            } else {
                out.append(REVERSE_CODEBOOK[token]);
            }
        }
        return out.toString();
    }

    /**
     * Scans a codebook slot for an entry of the given length equal to the text at the given
     * offset. The slot is scanned one character at a time, looking for a character whose value
     * equals the length and that is followed by the searched text and a code character.
     *
     * @return the code of the matching entry, or -1 if the slot holds no such entry
     */
    private static int findCode(final String slot, final String text, final int offset,
            final int len) {
        for (int i = 0; i + len + 1 < slot.length(); i++) {
            // regionMatches returns false (rather than throwing) if the text is too short.
            if (slot.charAt(i) == len && slot.regionMatches(i + 1, text, offset, len)) {
                return slot.charAt(i + len + 1);
            }
        }
        return -1;
    }

    /** Builds the uncompressed form: the flag byte followed by the UTF-8 bytes of the string. */
    private static byte[] encodeUncompressed(final String strg) {
        final byte[] bytes = strg.getBytes(UTF_8);
        final byte[] result = new byte[bytes.length + 1];
        result[0] = UNCOMPRESSED_FLAG;
        System.arraycopy(bytes, 0, result, 1, bytes.length);
        return result;
    }

    /**
     * Tells whether the string can be compressed, i.e., whether every character lies in the
     * range {@code 32..126}. Control characters are rejected as well as non-ASCII ones.
     */
    private static boolean isOnlyAscii(final String input) {
        for (int i = 0; i < input.length(); i++) {
            final char c = input.charAt(i);
            if (c <= 31 || c >= 127) {
                return false;
            }
        }
        return true;
    }

    /**
     * Writes a run of verbatim characters: token 254 plus the character for a run of one,
     * otherwise token 255, the run length and the characters.
     *
     * @param baos
     *            the stream receiving the encoded run
     * @param str
     *            the verbatim characters, between 1 and 255 of them
     */
    private static void outputVerb(final ByteArrayOutputStream baos, final String str) {
        if (str.length() == 1) {
            baos.write(VERBATIM_SINGLE);
            baos.write(str.charAt(0));
        } else {
            final byte[] bytes = str.getBytes(UTF_8);
            baos.write(VERBATIM_RUN);
            baos.write(str.length());
            baos.write(bytes, 0, bytes.length);
        }
    }

    private Smaz() {
    }

}