1 package eu.fbk.knowledgestore.data;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 import java.io.ByteArrayOutputStream;
18 import java.io.IOException;
19 import java.nio.CharBuffer;
20
21 import com.google.common.base.Charsets;
22
23
24
25
26
27
28
29 final class Smaz {
30
31 private static final byte UNCOMPRESSED_FLAG = 1;
32
33
34 private static final String CODEBOOK[] = { "\002s,\266", "\003had\232\002leW", "\003on \216",
35 "", "\001yS", "\002ma\255\002li\227", "\003or \260", "", "\002ll\230\003s t\277",
36 "\004fromg\002mel", "", "\003its\332", "\001z\333", "\003ingF", "\001>\336",
37 "\001 \000\003 (\002nc\344", "\002nd=\003 on\312", "\002ne\213\003hat\276\003re q",
38 "", "\002ngT\003herz\004have\306\003s o\225", "", "\003ionk\003s a\254\002ly\352",
39 "\003hisL\003 inN\003 be\252", "", "\003 fo\325\003 of \003 ha\311", "", "\002of\005",
40 "\003 co\241\002no\267\003 ma\370", "", "", "\003 cl\356\003enta\003 an7",
41 "\002ns\300\001\"e", "\003n t\217\002ntP\003s, \205",
42 "\002pe\320\003 we\351\002om\223", "\002on\037", "", "\002y G", "\003 wa\271",
43 "\003 re\321\002or*", "", "\002=\"\251\002ot\337", "\003forD\002ou[", "\003 toR",
44 "\003 th\r", "\003 it\366", "\003but\261\002ra\202\003 wi\363\002</\361",
45 "\003 wh\237", "\002 4", "\003nd ?", "\002re!", "", "\003ng c", "",
46 "\003ly \307\003ass\323\001a\004\002rir", "", "", "", "\002se_", "\003of \"",
47 "\003div\364\002ros\003ere\240", "", "\002ta\310\001bZ\002si\324", "",
48 "\003and\u0007\002rs\335", "\002rt\362", "\002teE", "\003ati\316", "\002so\263",
49 "\002th\021", "\002tiJ\001c\034\003allp", "\003ate\345", "\002ss\246", "\002stM", "",
50 "\002><\346", "\002to\024", "\003arew", "\001d\030", "\002tr\303", "",
51 "\001\n1\003 a \222", "\003f tv\002veo", "\002un\340", "", "\003e o\242",
52 "\002a \243\002wa\326\001e\002", "\002ur\226\003e a\274", "\002us\244\003\n\r\n\247",
53 "\002ut\304\003e c\373", "\002we\221", "", "", "\002wh\302", "\001f,", "", "", "",
54 "\003d t\206", "", "", "\003th \343", "\001g;", "", "", "\001\r9\003e s\265",
55 "\003e t\234", "", "\003to Y", "\003e\r\n\236", "\002d \036\001h\022", "", "\001,Q",
56 "\002 a\031", "\002 b^", "\002\r\n\025\002 cI", "\002 d\245", "\002 e\253",
57 "\002 fh\001i\b\002e \013", "", "\002 hU\001-\314", "\002 i8", "", "", "\002 l\315",
58 "\002 m{", "\002f :\002 n\354", "\002 o\035", "\002 p}\001.n\003\r\n\r\250", "",
59 "\002 r\275", "\002 s>", "\002 t\016", "", "\002g \235\005which+\003whi\367",
60 "\002 w5", "\001/\305", "\003as \214", "\003at \207", "", "\003who\331", "",
61 "\001l\026\002h \212", "", "\002, $", "", "\004withV", "", "", "", "\001m-", "", "",
62 "\002ac\357", "\002ad\350", "\003TheH", "", "", "\004this\233\001n\t", "", "\002. y",
63 "", "\002alX\003e, \365", "\003tio\215\002be\\", "\002an\032\003ver\347", "",
64 "\004that0\003tha\313\001o\006", "\003was2", "\002arO", "\002as.",
65 "\002at'\003the\001\004they\200\005there\322\005theird", "\002ce\210", "\004were]",
66 "", "\002ch\231\002l \264\001p<", "", "", "\003one\256", "", "\003he \023\002dej",
67 "\003ter\270", "\002cou", "", "\002by\177\002di\201\002eax", "", "\002ec\327",
68 "\002edB", "\002ee\353", "", "", "\001r\f\002n )", "", "", "", "\002el\262", "",
69 "\003in i\002en3", "", "\002o `\001s\n", "", "\002er\033", "\003is t\002es6", "",
70 "\002ge\371", "\004.com\375", "\002fo\334\003our\330", "\003ch \301\001t\003",
71 "\002hab", "", "\003men\374", "", "\002he\020", "", "", "\001u&", "\002hif", "",
72 "\003not\204\002ic\203", "\003ed @\002id\355", "", "", "\002ho\273", "\002r K\001vm",
73 "", "", "", "\003t t\257\002il\360", "\002im\342", "\003en \317\002in\017",
74 "\002io\220", "\002s \027\001wA", "", "\003er |", "\003es ~\002is%", "\002it/", "",
75 "\002iv\272", "", "\002t #\u0007http://C\001x\372", "\002la\211", "\001<\341",
76 "\003, a\224" };
77
78
79 private static final String REVERSE_CODEBOOK[] = { " ", "the", "e", "t", "a", "of", "o",
80 "and", "i", "n", "s", "e ", "r", " th", " t", "in", "he", "th", "h", "he ", "to",
81 "\r\n", "l", "s ", "d", " a", "an", "er", "c", " o", "d ", "on", " of", "re", "of ",
82 "t ", ", ", "is", "u", "at", " ", "n ", "or", "which", "f", "m", "as", "it", "that",
83 "\n", "was", "en", " ", " w", "es", " an", " i", "\r", "f ", "g", "p", "nd", " s",
84 "nd ", "ed ", "w", "ed", "http://", "for", "te", "ing", "y ", "The", " c", "ti", "r ",
85 "his", "st", " in", "ar", "nt", ",", " to", "y", "ng", " h", "with", "le", "al",
86 "to ", "b", "ou", "be", "were", " b", "se", "o ", "ent", "ha", "ng ", "their", "\"",
87 "hi", "from", " f", "in ", "de", "ion", "me", "v", ".", "ve", "all", "re ", "ri",
88 "ro", "is ", "co", "f t", "are", "ea", ". ", "her", " m", "er ", " p", "es ", "by",
89 "they", "di", "ra", "ic", "not", "s, ", "d t", "at ", "ce", "la", "h ", "ne", "as ",
90 "tio", "on ", "n t", "io", "we", " a ", "om", ", a", "s o", "ur", "li", "ll", "ch",
91 "had", "this", "e t", "g ", "e\r\n", " wh", "ere", " co", "e o", "a ", "us", " d",
92 "ss", "\n\r\n", "\r\n\r", "=\"", " be", " e", "s a", "ma", "one", "t t", "or ", "but",
93 "el", "so", "l ", "e s", "s,", "no", "ter", " wa", "iv", "ho", "e a", " r", "hat",
94 "s t", "ns", "ch ", "wh", "tr", "ut", "/", "have", "ly ", "ta", " ha", " on", "tha",
95 "-", " l", "ati", "en ", "pe", " re", "there", "ass", "si", " fo", "wa", "ec", "our",
96 "who", "its", "z", "fo", "rs", ">", "ot", "un", "<", "im", "th ", "nc", "ate", "><",
97 "ver", "ad", " we", "ly", "ee", " n", "id", " cl", "ac", "il", "</", "rt", " wi",
98 "div", "e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com" };
99
100
101
102
103
104
105
106
107 public static byte[] compress(final String strg) {
108
109 final ByteArrayOutputStream output = new ByteArrayOutputStream();
110
111 if (!isOnlyAscii(strg)) {
112 final byte[] bytes = strg.getBytes(Charsets.UTF_8);
113 output.write(UNCOMPRESSED_FLAG);
114 output.write(bytes, 0, bytes.length);
115 return output.toByteArray();
116 }
117
118 final StringBuilder verb = new StringBuilder();
119
120 final CharBuffer charBuffer = CharBuffer.wrap(strg);
121 int inlen;
122
123
124 while ((inlen = charBuffer.remaining()) > 0) {
125 int h1, h2, h3;
126 charBuffer.mark();
127 h1 = h2 = charBuffer.get() << 3;
128 if (inlen > 1) {
129 h2 += charBuffer.get();
130 }
131 if (inlen > 2) {
132 h3 = h2 ^ charBuffer.get();
133 } else {
134 h3 = 0;
135 }
136 charBuffer.reset();
137
138 int j = 7;
139 if (j > inlen) {
140 j = inlen;
141 }
142
143 boolean found = false;
144
145
146
147
148
149 for (; j > 0; j--) {
150 CharBuffer slot;
151 if (j == 1) {
152 slot = CharBuffer.wrap(CODEBOOK[h1 % 241]);
153 } else if (j == 2) {
154 slot = CharBuffer.wrap(CODEBOOK[h2 % 241]);
155 } else {
156 slot = CharBuffer.wrap(CODEBOOK[h3 % 241]);
157 }
158
159 final int slotLength = slot.length();
160 int slotIndex = 0;
161 int slotEndIndex = slotIndex + j + 1;
162 while (slotLength > 0 && slotEndIndex <= slotLength) {
163 if (slot.get(slotIndex) == j
164 && inlen >= j
165 && slot.subSequence(slotIndex + 1, slotEndIndex).toString()
166 .equals(charBuffer.subSequence(0, j).toString())) {
167
168
169 if (verb.length() > 0) {
170
171 outputVerb(output, verb.toString());
172 verb.setLength(0);
173 }
174
175
176
177 output.write(slot.get(slot.get(slotIndex) + 1 + slotIndex));
178 charBuffer.position(charBuffer.position() + j);
179 inlen -= j;
180 found = true;
181 break;
182 } else {
183 slotIndex++;
184 slotEndIndex = slotIndex + j + 1;
185 }
186 }
187 }
188
189
190 if (!found) {
191 if (inlen > 0) {
192 inlen--;
193 verb.append(charBuffer.subSequence(0, 1).toString());
194 }
195 charBuffer.position(charBuffer.position() + 1);
196 }
197
198
199
200 final int verbLength = verb.length();
201 if (verbLength == 255 || verbLength > 0 && inlen == 0) {
202 outputVerb(output, verb.toString());
203 verb.setLength(0);
204 }
205
206 }
207 return output.toByteArray();
208 }
209
210
211
212
213
214
215
216
217 public static String decompress(final byte[] strBytes) {
218
219 if (strBytes[0] == UNCOMPRESSED_FLAG) {
220 return new String(strBytes, 1, strBytes.length, Charsets.UTF_8);
221 }
222
223 final StringBuilder out = new StringBuilder();
224
225 for (int i = 0; i < strBytes.length; i++) {
226 final char b = (char) (0xFF & strBytes[i]);
227 if (b == 254) {
228 out.append((char) strBytes[++i]);
229 } else if (b == 255) {
230 final int length = 0xFF & strBytes[++i];
231 for (int j = 1; j <= length; j++) {
232 out.append((char) strBytes[i + j]);
233 }
234 i += length;
235 } else {
236 final int loc = 0xFF & b;
237 out.append(REVERSE_CODEBOOK[loc]);
238 }
239 }
240 return out.toString();
241 }
242
243 private static boolean isOnlyAscii(final String input) {
244
245 final char[] chars = input.toCharArray();
246
247 for (final char c : chars) {
248 if (c <= 31 || c >= 127) {
249 return false;
250 }
251 }
252
253 return true;
254 }
255
256
257
258
259
260
261
262 private static void outputVerb(final ByteArrayOutputStream baos, final String str) {
263 if (str.length() == 1) {
264 baos.write(254);
265 baos.write(str.toCharArray()[0]);
266 } else {
267 final byte[] bytes = str.getBytes(Charsets.UTF_8);
268 baos.write(255);
269 baos.write(str.length());
270 baos.write(bytes, 0, bytes.length);
271 }
272 }
273
274 private Smaz() {
275 }
276
277 }