1 package eu.fbk.knowledgestore.populator.naf;
2
3 import com.google.common.io.ByteStreams;
4 import eu.fbk.knowledgestore.data.Data;
5 import eu.fbk.knowledgestore.data.Record;
6 import eu.fbk.knowledgestore.populator.naf.model.*;
7 import eu.fbk.knowledgestore.vocabulary.KS;
8 import eu.fbk.knowledgestore.vocabulary.NIF;
9 import eu.fbk.knowledgestore.vocabulary.NWR;
10 import eu.fbk.rdfpro.util.IO;
11 import org.openrdf.model.URI;
12 import org.openrdf.model.impl.URIImpl;
13 import org.openrdf.model.impl.ValueFactoryImpl;
14 import org.openrdf.model.vocabulary.DCTERMS;
15 import org.openrdf.model.vocabulary.RDF;
16 import org.slf4j.Logger;
17
18 import javax.xml.bind.JAXBContext;
19 import javax.xml.bind.JAXBException;
20 import javax.xml.bind.Unmarshaller;
21 import java.io.*;
22 import java.net.URL;
23 import java.nio.file.Path;
24 import java.nio.file.Paths;
25 import java.util.*;
26
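/**
 * Populates a KnowledgeStore from a NAF (NLP Annotation Format) document: it reads the NAF
 * header, raw text, entities, coreferences, time expressions, factualities, SRL, causal links
 * (CLINK) and temporal links (TLINK), and converts them into KS mention records plus the news
 * and NAF resource records.
 */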
27 public class processNAF {
28
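    /**
     * Command-line entry point: {@code processNAF path [disabled_items]}, where {@code path} is a
     * single NAF file or a directory of *.naf files and {@code disabled_items} optionally lists
     * layers to skip (Entities|Mentions|Resources).
     */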
29 public static void main(String[] args) throws Exception {
30
31 String disabled_Items = "", path = "";
32 if (args.length > 0) {
33 path = args[0];
34 if (args.length > 1) {
35 disabled_Items = args[1];
36 }
37 } else {
38                System.err.println(
39                        "Usage: eu.fbk.knowledgestore.populator.naf.processNAF path disabled_items\n"
40                                + "disabled_items = [Entities|Mentions|Resources]");
41                throw new Exception("Missing required argument: path of a NAF file or directory");
42
43 }
44 processNAFVariables vars = new processNAFVariables();
45
46 analyzePathAndRunSystem(path, disabled_Items, vars);
47 }
48
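    /**
     * Processes a single NAF file and returns the extracted data as a {@link KSPresentation}
     * (raw news text, mention map, NAF and news resource records, statistics), or {@code null}
     * if processing fails.
     *
     * <p>Illustrative usage (file names are placeholders):
     * <pre>
     * Writer report = new OutputStreamWriter(new FileOutputStream("report.txt"), "UTF-8");
     * KSPresentation ks = processNAF.init("/path/to/document.naf", report, "", false);
     * </pre>
     */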
49    public static KSPresentation init(String fPath, Writer inout, String disabled_Items,
50            boolean store_partial_info) throws JAXBException, IOException {
51        processNAFVariables vars = new processNAFVariables();
52        vars.storePartialInforInCaseOfError = store_partial_info;
53 vars.out = inout;
54 statistics stat;
55 try {
56 stat = readFile(fPath, disabled_Items, vars);
57
58 KSPresentation returned = new KSPresentation();
59 returned.setNaf_file_path(fPath);
60 returned.setNews(vars.rawText);
61 returned.setMentions(vars.mentionListHash);
62 returned.setNaf(vars.nafFile2);
63 returned.setNewsResource(vars.newsFile2);
64 returned.setStats(stat);
65 return returned;
66 } catch (Exception e) {
67 e.printStackTrace();
68 return null;
69 }
70 }
71
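    /** Processes a single NAF file or every *.naf file in a directory, writing a report file next to the input. */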
72 private static void analyzePathAndRunSystem(String path, String disabled_Items, processNAFVariables vars)
73 throws Exception {
74 vars.filePath = new File(path);
75 if (vars.filePath.exists() && vars.filePath.isDirectory()) {
76
77 vars.out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(
78 vars.filePath.getPath(), "report.txt")), "utf-8"));
79 File[] listOfFiles = vars.filePath.listFiles();
80 for (int i = 0; i < listOfFiles.length; i++) {
81 if (listOfFiles[i].isFile() && listOfFiles[i].getName().endsWith(".naf")) {
82 System.err.println(i + "=" + listOfFiles[i].getName());
83 vars.out.append("\n" + i + "=" + listOfFiles[i].getName() + "\n");
84 readFile(listOfFiles[i].getPath(), disabled_Items, vars);
85 }
86 vars.out.flush();
87                System.gc();
89 }
90 } else if (vars.filePath.exists() && vars.filePath.isFile()) {
91
92 vars.out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(
93 vars.filePath.getPath() + ".report.txt")), "utf-8"));
94 vars.out.append(vars.filePath.getPath() + "\n");
95 readFile(vars.filePath.getPath(), disabled_Items, vars);
96 }
97 vars.out.flush();
98 vars.out.close();
99 }
100
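    /**
     * Reads one NAF file, maps every supported annotation layer to KS mentions (collected in
     * {@code vars.mentionListHash}), and returns the resulting {@link statistics}.
     */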
101 public static statistics readFile(String filepath, String disabled_Items, processNAFVariables vars)
102 throws Exception {
103 vars.storePartialInforInCaseOfError = true;
104 vars.filePath = new File(filepath);
105 logDebug("Start working with (" + vars.filePath.getName() + ")", vars);
106 String disabledItems = "";
107        if (disabled_Items != null
108                && (disabled_Items.matches("(?i).*Entit.*") || disabled_Items.matches("(?i).*Mention.*")
109                        || disabled_Items.matches("(?i).*Resource.*"))) {
110 disabledItems = disabled_Items;
111 logDebug("Disable layer: " + disabledItems, vars);
112 }
113 readNAFFile(vars.filePath, vars);
114 NafHeader header = vars.doc.getNafHeader();
115 getNAFHEADERMentions(header, vars);
116 Raw raw = vars.doc.getRaw();
117 if (raw != null) {
118 vars.rawText = raw.getvalue();
119 vars.globalText = vars.doc.getText();
120 vars.globalTerms = vars.doc.getTerms();
121 getEntitiesMentions(vars.doc.getEntities(), disabledItems, vars);
122 getCoreferencesMentions(vars.doc.getCoreferences(), vars);
123 getTimeExpressionsMentions(vars.doc.getTimeExpressions(), vars);
124
125 getFactualityMentionsV3(vars.doc.getFactualities(), vars);
126 getSRLMentions(vars);
127 getCLinksMentions(vars.doc.getCausalRelations(), vars);
128 getTLinksMentions(vars.doc.getTemporalRelations(), vars);
129
130
131 fixMentions(vars);
132 }
133
134 logDebug("End of NAF populating.", vars);
135 statistics st = new statistics();
136 st.setObjectMention((vars.corefMention2 + vars.entityMen2));
137 st.setPER(vars.PER);
138 st.setORG(vars.ORG);
139 st.setLOC(vars.LOC);
140 st.setFin(vars.fin);
141 st.setMix(vars.mix);
142 st.setPRO(vars.PRO);
143 st.setNo_mapping(vars.no_mapping);
144 st.setTimeMention(vars.timeMention2);
145 st.setEventMention((vars.factualityMentions2 + vars.srlMention2));
146 st.setParticipationMention(vars.rolewithEntity2);
147 st.setEntity(vars.entityMen);
148 st.setCoref(vars.corefMention);
149 st.setCorefMentionEvent(vars.corefMentionEvent);
150 st.setCorefMentionNotEvent(vars.corefMentionNotEvent);
151 st.setFactuality(vars.factualityMentions);
152 st.setRole(vars.roleMentions);
153 st.setRolewithEntity(vars.rolewithEntity);
154 st.setRolewithoutEntity(vars.rolewithoutEntity);
155 st.setSrl(vars.srlMention);
156 st.setTimex(vars.timeMention);
157 st.setClinkMention(vars.clinkMentions);
158 st.setClinkMentionDiscarded(vars.clinkMentionsDiscarded);
159 st.setTlinkMention(vars.tlinkMentions);
160 st.setTlinkMentionsEnriched(vars.tlinkMentionsEnriched);
161 st.setTlinkMentionDiscarded(vars.tlinkMentionsDiscarded);
162 logDebug(st.getStats(), vars);
163 return st;
164 }
165
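    /**
     * Maps NAF &lt;entities&gt; to OBJECT/ENTITY mentions; the highest-confidence external
     * reference (preferring English DBpedia) is attached to the mention as KS.REFERS_TO.
     */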
166 private static void getEntitiesMentions(Entities obj, String disabledItems, processNAFVariables vars)
167 throws Exception {
168 if (!checkHeaderTextTerms(vars)) {
169 logError("Error: populating stopped", vars);
170 } else {
171 logDebug("Start mapping the Entities mentions:", vars);
172 }
173 for (Entity entObj : ((Entities) obj).getEntity()) {
174 String deg = "";
175 int referencesElements = 0;
176 String charS = null;
177
178
179
180 for (Object generalEntObj : entObj.getReferencesOrExternalReferences()) {
181 if (generalEntObj instanceof References) {
182
183
184
185 referencesElements++;
186 if (((References) generalEntObj).getSpan().size() < 1) {
187 logDebug("Every entity must contain a 'span' element inside 'references'", vars);
188 }
189 if (((References) generalEntObj).getSpan().size() > 1) {
190 logDebug("xpath(///NAF/entities/entity/references/span/), spanSize("
191 + ((References) generalEntObj).getSpan().size()
192 + ") Every entity must contain a unique 'span' element inside 'references'", vars);
193 }
194 for (Span spansObj : ((References) generalEntObj).getSpan()) {
195 boolean addMentionFlag = true;
196
197 if (spansObj.getTarget().size() < 1) {
198 addMentionFlag = false;
199 logDebug("Every span in an entity must contain at least one target inside", vars);
200 continue;
201 }
202
203 Record m = Record.create();
204 deg += "RDF.TYPE:OBJECT_MENTION,ENTITY_MENTION,MENTION";
205 m.add(RDF.TYPE, NWR.OBJECT_MENTION, NWR.ENTITY_MENTION, KS.MENTION);
206 deg = "MENTION_OF:" + vars.news_file_id.stringValue() + "|" + deg;
207 m.add(KS.MENTION_OF, vars.news_file_id);
208
209 if (((References) generalEntObj).getSpan().size() > 1) {
210 m.add(NWR.LOCAL_COREF_ID, entObj.getId());
211 deg += "|LOCAL_COREF_ID:" + entObj.getId();
212 }
213 generateTheMIdAndSetID(spansObj, m, vars);
214 charS = m.getUnique(NIF.BEGIN_INDEX, Integer.class) + "," + m
215 .getUnique(NIF.END_INDEX, Integer.class);
216 deg = "MentionId:" + m.getID() + "|" + deg;
217
218
219
220
221
222 boolean keepEntityTypeProvidedByNaf = true, dbpedia = true;
223 String type3charLC = "";
224 if (!dbpedia) {
225                            if (entObj.getType() != null && !entObj.getType().isEmpty()
226                                    && !entObj.getType().equalsIgnoreCase("misc")) {
227 type3charLC = entObj.getType().substring(0, 3).toLowerCase();
228 } else {
229 type3charLC = "misc";
230 entObj.setType("misc");
231 }
232 if (keepEntityTypeProvidedByNaf) {
233 URI dynamicTypeUri = ValueFactoryImpl.getInstance()
234 .createURI(NWR.NAMESPACE, "entity_type_" + entObj.getType().toLowerCase());
235 m.add(NWR.ENTITY_TYPE, dynamicTypeUri);
236 deg += "|ENTITY_TYPE:" + dynamicTypeUri;
237 logDebug("ROL1: <entity> added new mention for id " + entObj.getId() + ", charSpan |"
238 + getCharSpanFromSpan(spansObj, vars) + "|, type " + dynamicTypeUri, vars);
239 } else {
240 if (vars.entityTypeMapper.containsKey(type3charLC)
241 && vars.entityTypeMapper.get(type3charLC) != null) {
242 m.add(NWR.ENTITY_TYPE, vars.entityTypeMapper.get(type3charLC));
243 deg += "|ENTITY_TYPE:" + vars.entityTypeMapper.get(type3charLC);
244 logDebug("ROL1: <entity> STRANGE added new mention for id " + entObj.getId()
245 + ", charSpan |"
246 + getCharSpanFromSpan(spansObj, vars) + "|, type " + vars.entityTypeMapper
247 .get(type3charLC), vars);
248 } else {
249 addMentionFlag = false;
250 logDebug("xpath(//NAF/entities/entity/@type),type(" + entObj.getType() + "), id("
251 + entObj.getId() + ") NO mapping for it", vars);
252 vars.no_mapping++;
253 }
254 }
255 }
256 else {
257 if (entObj.getType() == null || entObj.getType().equals("")) {
258 type3charLC = "misc";
259 entObj.setType("misc");
260 } else {
261 if (entObj.getType().toLowerCase().contains("per") || entObj.getType().toLowerCase()
262 .contains("dbpedia:person")) {
263 entObj.setType("person");
264 type3charLC = entObj.getType().substring(0, 3).toLowerCase();
265 } else if (entObj.getType().toLowerCase().contains("org") || entObj.getType()
266 .toLowerCase().contains("dbpedia:organisation")) {
267 entObj.setType("organization");
268 type3charLC = entObj.getType().substring(0, 3).toLowerCase();
269                                } else if (entObj.getType().toLowerCase().contains("loc") || entObj.getType()
270                                        .toLowerCase().contains("dbpedia:place")) {
271 entObj.setType("location");
272 type3charLC = entObj.getType().substring(0, 3).toLowerCase();
273 } else {
274 entObj.setType("misc");
275 type3charLC = "misc";
276 }
277 }
278 URI dynamicTypeUri = ValueFactoryImpl.getInstance()
279 .createURI(NWR.NAMESPACE, "entity_type_" + entObj.getType().toLowerCase());
280 m.add(NWR.ENTITY_TYPE, dynamicTypeUri);
281 deg += "|ENTITY_TYPE:" + dynamicTypeUri;
282 logDebug("ROL1: <entity> added new mention for id " + entObj.getId() + ", charSpan |"
283 + getCharSpanFromSpan(spansObj, vars) + "|, type " + dynamicTypeUri, vars);
284
285 }
286 if (addMentionFlag) {
287 if (addOrMergeAMention(m, vars) == 1) {
288 String charS2 = m.getUnique(NIF.BEGIN_INDEX, Integer.class) + "," + m
289 .getUnique(NIF.END_INDEX, Integer.class);
290 vars.entityMentions.put(charS2, m);
291 if (type3charLC.equalsIgnoreCase("PER")) {
292 vars.PER++;
293 }
294 if (type3charLC.equalsIgnoreCase("LOC")) {
295 vars.LOC++;
296 }
297 if (type3charLC.equalsIgnoreCase("ORG")) {
298 vars.ORG++;
299 }
300 if (type3charLC.equalsIgnoreCase("PRO")) {
301 vars.PRO++;
302 }
303 if (type3charLC.equalsIgnoreCase("fin")) {
304 vars.fin++;
305 }
306 if (type3charLC.equalsIgnoreCase("mix") || type3charLC.equalsIgnoreCase("misc")) {
307 vars.mix++;
308 }
309 vars.entityMen2++;
310 vars.entityMen++;
311 }
312 }
313 }
314 } else if (generalEntObj instanceof ExternalReferences) {
315
316
317
318
319
320
321
322 List<ExternalRef> externalRefs = ((ExternalReferences) generalEntObj).getExternalRef();
323
324
325
326
327
328
329
330
331
332                    if (!disabledItems.matches("(?i).*Entit.*")) {
333 boolean firstTimeFlag = true;
334 String chosenReferenceValue = null;
335
336
337
338 boolean modeactive = true;
339
340
341 for (ExternalRef exRObj : externalRefs) {
342 if (exRObj.getSource() == null) {
343 exRObj.setSource("en");
344 }
345
346 if (exRObj.getSource().equalsIgnoreCase("POCUS")) {
347 if (referencesElements < 1) {
348 logDebug(
349 "Every entity must contain a 'references' element:not possible to add ExternalRef to null.",
350 vars);
351 continue;
352 }
353
354 String referenceValue = exRObj.getReference();
355                                chosenReferenceValue = referenceValue;
356 modeactive = false;
357 }
358 }
359
360 if (modeactive) {
361 LinkedList<ExternalRef> exrEn = new LinkedList<ExternalRef>();
362 LinkedList<ExternalRef> exrEs = new LinkedList<ExternalRef>();
363 LinkedList<ExternalRef> exrIt = new LinkedList<ExternalRef>();
364 LinkedList<ExternalRef> exrNl = new LinkedList<ExternalRef>();
365
366 for (ExternalRef exRObj : externalRefs) {
367 if (exRObj != null) {
368 getAllLayersOfExternalReferences(modeactive, exrEn, exRObj, exrEs, exrIt, exrNl,
369 vars);
370 }
371 }
372
373 String highConfidenceReferenceValue = null;
374 if (exrEn.size() > 0) {
375 highConfidenceReferenceValue = getHighConfidenceReferenceValue(exrEn);
376 } else if (exrEs.size() > 0) {
377 highConfidenceReferenceValue = getHighConfidenceReferenceValue(exrEs);
378 } else if (exrIt.size() > 0) {
379 highConfidenceReferenceValue = getHighConfidenceReferenceValue(exrIt);
380 } else if (exrNl.size() > 0) {
381 highConfidenceReferenceValue = getHighConfidenceReferenceValue(exrNl);
382 }
383
384 if (highConfidenceReferenceValue != null) {
385                                chosenReferenceValue = highConfidenceReferenceValue;
386 }
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408 }
409
410 if (chosenReferenceValue != null) {
411 URIImpl chosenReferenceURI = new URIImpl(chosenReferenceValue);
412 if (charS != null && vars.mentionListHash.get(charS) != null
413 && vars.mentionListHash.get(charS).get(KS.REFERS_TO).size() == 0) {
414 vars.mentionListHash.get(charS).add(KS.REFERS_TO, chosenReferenceURI);
415
416 deg += "|REFERS_TO:" + chosenReferenceValue;
417 vars.entityMentions.get(charS).add(KS.REFERS_TO, chosenReferenceURI);
418 } else {
419 if (charS != null && vars.mentionListHash.get(charS) != null) {
420 deg += "|REFERS_TO:" + vars.mentionListHash.get(charS).get(KS.REFERS_TO);
421 }
422
423 }
424 }
425 }
426 }
427
428 }
429
430 logDebug(deg, vars);
431 if (referencesElements < 1) {
432 logDebug("Every entity must contain a 'references' element", vars);
433 }
434 }
435 }
436
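    /**
     * Recursively collects external references into per-language buckets (en/es/it/nl), using the
     * reftype attribute when present and otherwise the DBpedia host of the reference.
     */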
437 private static void getAllLayersOfExternalReferences(boolean modeactive, LinkedList<ExternalRef> exrEn,
438 ExternalRef exRObj, LinkedList<ExternalRef> exrEs, LinkedList<ExternalRef> exrIt,
439 LinkedList<ExternalRef> exrNl, processNAFVariables vars) {
440
441 if (modeactive) {
442 if (exRObj.getReftype() != null) {
443 switch (exRObj.getReftype()) {
444 case "en":
445 exrEn.addLast(exRObj);
446 break;
447 case "es":
448 exrEs.addLast(exRObj);
449 break;
450 case "it":
451 exrIt.addLast(exRObj);
452 break;
453 case "nl":
454 exrNl.addLast(exRObj);
455 break;
456 }
457 } else if (exRObj.getReference().contains("dbpedia")) {
458
459 if (exRObj.getReference().contains("://dbpedia.org")) {
460 exrEn.addLast(exRObj);
461 } else if (exRObj.getReference().contains("://es.dbpedia.org")) {
462 exrEs.addLast(exRObj);
463 } else if (exRObj.getReference().contains("://it.dbpedia.org")) {
464 exrIt.addLast(exRObj);
465 } else if (exRObj.getReference().contains("://nl.dbpedia.org")) {
466 exrNl.addLast(exRObj);
467 }
468 } else {
469 logDebug(
470 "Every entity must contain a 'references' element with type or DBpedia reference: not possible to add ExternalRef to null.",
471 vars);
472 }
473 }
474
475 if (exRObj != null && exRObj.getExternalRef() != null && exRObj.getExternalRef().size() > 0) {
476 for (ExternalRef ff : exRObj.getExternalRef()) {
477 getAllLayersOfExternalReferences(modeactive, exrEn, ff, exrEs, exrIt, exrNl, vars);
478 }
479 } else {
480 return;
481 }
482 }
483
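    /** Returns the reference with the highest numeric confidence in the given list, or null if the list is empty. */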
484 private static String getHighConfidenceReferenceValue(
485 LinkedList<ExternalRef> exrl) {
486 ExternalRef current = null;
487 Double dt = 0.0;
488 for (ExternalRef tmp : exrl) {
489 Double co = Double.parseDouble(tmp.getConfidence());
490 if (current == null) {
491 current = tmp;
492 dt = co;
493 } else {
494 if (co > dt) {
495 current = tmp;
496 dt = co;
497 }
498 }
499 }
500 if (current == null) {
501 return null;
502 }
503
504 return current.getReference();
505 }
506
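    /** Maps NAF &lt;timex3&gt; elements to TIME_MENTION records, copying the TIMEX3 attributes that are present. */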
507 private static void getTimeExpressionsMentions(TimeExpressions obj, processNAFVariables vars) throws Exception {
508 if (!checkHeaderTextTerms(vars)) {
509 logError("Error: populating interrupted", vars);
510 } else {
511 logDebug("Start mapping the TimeExpressions mentions:", vars);
512 }
513 for (Timex3 tmxObj : ((TimeExpressions) obj).getTimex3()) {
514
515 Span tmxSpan = tmxObj.getSpan();
516 String tmxTypeUC = tmxObj.getType().toUpperCase();
517 boolean keepTimeTypeProvidedByNaf = true;
518
519
520 if ((tmxSpan == null) || (tmxSpan.getTarget().size() < 1)) {
521 logDebug("skipping timex3 without span, id is " + tmxObj.getId(), vars);
522 continue;
523 }
524
525 String deg = "";
526 Record m = Record.create();
527 m.add(RDF.TYPE, NWR.TIME_MENTION, NWR.TIME_OR_EVENT_MENTION, NWR.ENTITY_MENTION,
528 KS.MENTION);
529 deg += "|TYPE:TIME_MENTION,TIME_OR_EVENT_MENTION,ENTITY_MENTION,MENTION";
530 m.add(KS.MENTION_OF, vars.news_file_id);
531 LinkedList<Wf> wordsL = fromSpanGetAllMentionsTmx(((Span) tmxSpan).getTarget(), vars);
532 generateMIDAndSetIdWF(wordsL, m, vars);
533 deg = "MentionId:" + m.getID() + deg;
534
535 if (keepTimeTypeProvidedByNaf) {
536 URI dynamicTypeUri = ValueFactoryImpl.getInstance()
537 .createURI(NWR.NAMESPACE, "timex3_" + tmxTypeUC.toLowerCase());
538 m.add(NWR.TIME_TYPE, dynamicTypeUri);
539 deg += "|TIME_TYPE:" + dynamicTypeUri;
540 logDebug("ROL1: <timex3> added new mention for id " + tmxObj.getId() + ", type " + dynamicTypeUri,
541 vars);
542 } else {
543 if (vars.timex3TypeMapper.containsKey(tmxTypeUC)
544 && vars.timex3TypeMapper.get(tmxTypeUC) != null) {
545 m.add(NWR.TIME_TYPE, vars.timex3TypeMapper.get(tmxTypeUC));
546 deg += "|TIME_TYPE:" + vars.timex3TypeMapper.get(tmxTypeUC);
547 logDebug("ROL1: <timex3> STRANGE added new mention for id " + tmxObj.getId()
548 + ", type " + vars.timex3TypeMapper.get(tmxTypeUC), vars);
549 } else {
550 logDebug("xpath(//NAF/timeExpressions/timex3/@type), type(" + tmxTypeUC
551 + "), No mapping.", vars);
552 }
553 }
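            // Note: the next two blocks are deliberately short-circuited with "false &&" and never execute.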
554 if (false && tmxObj.getBeginPoint() != null) {
555 m.add(NWR.BEGIN_POINT, tmxObj.getBeginPoint());
556 deg += "|BEGIN_POINT:" + tmxObj.getBeginPoint();
557 }
558 if (false && tmxObj.getEndPoint() != null) {
559 m.add(NWR.END_POINT, tmxObj.getEndPoint());
560 deg += "|END_POINT:" + tmxObj.getEndPoint();
561 }
562 if (tmxObj.getQuant() != null) {
563 m.add(NWR.QUANT, tmxObj.getQuant());
564 deg += "|QUANT:" + tmxObj.getQuant();
565 }
566 if (tmxObj.getFreq() != null) {
567 m.add(NWR.FREQ, tmxObj.getFreq());
568 deg += "|FREQ:" + tmxObj.getFreq();
569 }
570 if (tmxObj.getFunctionInDocument() != null) {
571 m.add(NWR.FUNCTION_IN_DOCUMENT, tmxObj.getFunctionInDocument());
572 deg += "|FUNCTION_IN_DOCUMENT:" + tmxObj.getFunctionInDocument();
573 }
574 if (tmxObj.getTemporalFunction() != null) {
575 m.add(NWR.TEMPORAL_FUNCTION, tmxObj.getTemporalFunction());
576 deg += "|TEMPORAL_FUNCTION:" + tmxObj.getTemporalFunction();
577 }
578 if (tmxObj.getValue() != null) {
579 m.add(NWR.VALUE, tmxObj.getValue());
580 deg += "|VALUE:" + tmxObj.getValue();
581 }
582 if (tmxObj.getValueFromFunction() != null) {
583 m.add(NWR.VALUE_FROM_FUNCTION, tmxObj.getValueFromFunction());
584 deg += "|VALUE_FROM_FUNCTION:" + tmxObj.getValueFromFunction();
585 }
586 if (tmxObj.getMod() != null) {
587 m.add(NWR.MOD, tmxObj.getMod());
588 deg += "|MOD:" + tmxObj.getMod();
589 }
590 if (tmxObj.getAnchorTimeID() != null) {
591 m.add(NWR.ANCHOR_TIME, tmxObj.getAnchorTimeID());
592 deg += "|ANCHOR_TIME:" + tmxObj.getAnchorTimeID();
593 }
594 logDebug(deg, vars);
595 int addedNew = addOrMergeAMention(m, vars);
596 if (addedNew == 1) {
597 vars.timeMention2++;
598 vars.timeMention++;
599 }
600 String charS2 =
601 m.getUnique(NIF.BEGIN_INDEX, Integer.class) + "," + m.getUnique(NIF.END_INDEX, Integer.class);
602 vars.entityMentions.put(charS2, m);
603 }
604 }
605
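    /** Maps the NAF &lt;factualitylayer&gt; (factvalue) representation to EVENT mentions. */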
606 private static void getFactualityMentions(Factualitylayer obj, processNAFVariables vars) throws Exception {
607 if (!checkHeaderTextTerms(vars)) {
608 logError("Error: populating interrupted", vars);
609 } else {
610 logDebug("Start mapping the Factuality mentions:", vars);
611 }
612 for (Factvalue fvObj : ((Factualitylayer) obj).getFactvalue()) {
613 String deg = "";
614 Record m = Record.create();
615 m.add(RDF.TYPE, NWR.EVENT_MENTION, NWR.TIME_OR_EVENT_MENTION, NWR.ENTITY_MENTION,
616 KS.MENTION);
617 deg += "|TYPE:EVENT_MENTION,TIME_OR_EVENT_MENTION,ENTITY_MENTION,MENTION";
618 m.add(KS.MENTION_OF, vars.news_file_id);
619 LinkedList<Target> tarlist = new LinkedList<Target>();
620 Target tmp = new Target();
621 tmp.setId(fvObj.getId());
622 tarlist.addLast(tmp);
623 LinkedList<Wf> wordsL = fromSpanGetAllMentionsTmx(tarlist, vars);
624 generateMIDAndSetIdWF(wordsL, m, vars);
625 deg = "MentionId:" + m.getID() + deg;
626
627
628
629 if (fvObj.getConfidence() != null) {
630 m.add(NWR.FACTUALITY_CONFIDENCE, fvObj.getConfidence());
631 deg += "|FACTUALITY_CONFIDENCE:" + fvObj.getConfidence();
632 }
633 logDebug(deg, vars);
634 int addedNew = addOrMergeAMention(m, vars);
635 if (addedNew == 1) {
636 vars.factualityMentions2++;
637 vars.factualityMentions++;
638 }
639 String charS2 =
640 m.getUnique(NIF.BEGIN_INDEX, Integer.class) + "," + m.getUnique(NIF.END_INDEX, Integer.class);
641 vars.entityMentions.put(charS2, m);
642
643 }
644
645 }
646
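    /** Maps NAF v3 &lt;factualities&gt; to EVENT mentions, attaching the factbank value when present. */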
647 private static void getFactualityMentionsV3(Factualities factualities, processNAFVariables vars) throws Exception {
648 if (!checkHeaderTextTerms(vars)) {
649 logError("Error: populating interrupted", vars);
650 } else {
651 logDebug("Start mapping the Factualities mentions:", vars);
652 }
653 for (Factuality fvObj : factualities.getFactuality()) {
654
655 String deg = "";
656 Record m = Record.create();
657 m.add(RDF.TYPE, NWR.EVENT_MENTION, NWR.TIME_OR_EVENT_MENTION, NWR.ENTITY_MENTION,
658 KS.MENTION);
659 deg += "|TYPE:EVENT_MENTION,TIME_OR_EVENT_MENTION,ENTITY_MENTION,MENTION";
660 m.add(KS.MENTION_OF, vars.news_file_id);
661 LinkedList<Target> tarlist = new LinkedList<Target>();
662 for (Target t : fvObj.getSpan().getTarget()) {
663 tarlist.addLast(t);
664 }
665 LinkedList<Wf> wordsL = fromSpanGetAllMentions(tarlist, vars);
666 generateMIDAndSetIdWF(wordsL, m, vars);
667 deg = "MentionId:" + m.getID() + deg;
668
669 for (FactVal tts : fvObj.getFactVal()) {
670 if (tts.getResource().equalsIgnoreCase("factbank")) {
671 m.add(NWR.FACT_BANK, tts.getValue());
672 }
673 }
674 logDebug(deg, vars);
675 int addedNew = addOrMergeAMention(m, vars);
676            if (addedNew == 1 || addedNew == 0) {
677 vars.factualityMentions2++;
678 vars.factualityMentions++;
679 }
680 String charS2 =
681 m.getUnique(NIF.BEGIN_INDEX, Integer.class) + "," + m.getUnique(NIF.END_INDEX, Integer.class);
682 vars.entityMentions.put(charS2, m);
683
684 }
685 }
686
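    /** Maps causal relations (&lt;clink&gt;) to CLINK relation mentions with SOURCE and TARGET mention ids. */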
687 private static void getCLinksMentions(CausalRelations causalRelations, processNAFVariables vars) throws Exception {
688 if (!checkHeaderTextTerms(vars)) {
689 logError("Error: populating interrupted", vars);
690 } else {
691 logDebug("Start mapping the CLINKS mentions:", vars);
692 }
693 for (Clink fvObj : causalRelations.getClink()) {
694
695 String deg = "";
696 Record m = Record.create();
697 m.add(RDF.TYPE, NWR.CLINK, NWR.RELATION_MENTION,
698 KS.MENTION);
699 deg += "|TYPE:CLINK,RELATION_MENTION,MENTION";
700 m.add(KS.MENTION_OF, vars.news_file_id);
701
702 LinkedList<Target> tarlist = new LinkedList<Target>();
703 List<Target> from = getSpanTermsOfPredicate(fvObj.getFrom(), vars);
704 List<Target> to = getSpanTermsOfPredicate(fvObj.getTo(), vars);
705 tarlist.addAll(from);
706 tarlist.addAll(to);
707
708 LinkedList<Wf> fromwl = fromSpanGetAllMentions(from, vars);
709 Record mtest1 = Record.create();
710 generateMIDAndSetIdWF(fromwl, mtest1, vars);
711 m.add(NWR.SOURCE, mtest1.getID());
712
713 LinkedList<Wf> towl = fromSpanGetAllMentions(to, vars);
714 Record mtest2 = Record.create();
715 generateMIDAndSetIdWF(towl, mtest2, vars);
716 m.add(NWR.TARGET, mtest2.getID());
717
718 LinkedList<Wf> wordsL = fromSpanGetAllMentions(tarlist, vars);
719 generateMIDAndSetIdWF(wordsL, m, vars);
720 deg = "MentionId:" + m.getID() + deg;
721
722 logDebug(deg, vars);
723 int addedNew = addOrMergeAMention(m, vars);
724 if (addedNew == 1) {
725 vars.clinkMentions++;
726            } else if (addedNew == -1) {
727 vars.clinkMentionsDiscarded++;
728 }
729
730 }
731 }
732
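    /**
     * Maps temporal relations (&lt;tlink&gt;) to TLINK relation mentions; links whose source or
     * target is tmx0 only enrich the mention on the other side instead of creating a new relation
     * mention.
     */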
733 private static void getTLinksMentions(TemporalRelations temporalRelations, processNAFVariables vars)
734 throws Exception {
735 if (!checkHeaderTextTerms(vars)) {
736 logError("Error: populating interrupted", vars);
737 } else {
738 logDebug("Start mapping the TLINKS mentions:", vars);
739 }
740
741 if (temporalRelations == null) {
742 return;
743 }
744 if (temporalRelations.getTlink() == null) {
745 return;
746 }
747
748 for (Tlink fvObj : temporalRelations.getTlink()) {
749
750 String deg = "";
751 Record m = Record.create();
752 m.add(RDF.TYPE, NWR.TLINK, NWR.RELATION_MENTION,
753 KS.MENTION);
754 deg += "|TYPE:TLINK,RELATION_MENTION,MENTION";
755 m.add(KS.MENTION_OF, vars.news_file_id);
756
757 LinkedList<Target> tarlist = new LinkedList<Target>();
758 List<Target> from = new ArrayList<Target>();
759 List<Target> to = new ArrayList<Target>();
760 if (fvObj.getFromType().equalsIgnoreCase("event")) {
761 from = getSpanTermsOfPredicate(fvObj.getFrom(), vars);
762 }
763
764 if (fvObj.getToType().equalsIgnoreCase("event")) {
765 to = getSpanTermsOfPredicate(fvObj.getTo(), vars);
766 }
767
768 tarlist.addAll(from);
769 tarlist.addAll(to);
770
771 LinkedList<Wf> allEventWF = fromSpanGetAllMentions(tarlist, vars);
772 LinkedList<Wf> allWFFrom = fromSpanGetAllMentions(from, vars);
773 LinkedList<Wf> allWFTO = fromSpanGetAllMentions(to, vars);
774
775 List<Wf> fromtmx = new ArrayList<Wf>();
776 List<Wf> totmx = new ArrayList<Wf>();
777
778 if (fvObj.getFromType().equalsIgnoreCase("timex")) {
779 fromtmx = getSpanOfTimex(fvObj.getFrom(), vars);
780 allWFFrom.addAll(fromtmx);
781 allEventWF.addAll(fromtmx);
782 }
783
784 if (fvObj.getToType().equalsIgnoreCase("timex")) {
785 totmx = getSpanOfTimex(fvObj.getTo(), vars);
786 allWFTO.addAll(totmx);
787 allEventWF.addAll(totmx);
788 }
789
790
791 allEventWF = reorderWFAscending(allEventWF, vars);
792 allWFFrom = reorderWFAscending(allWFFrom, vars);
793 allWFTO = reorderWFAscending(allWFTO, vars);
794
795 if (fvObj.getFrom().equalsIgnoreCase("tmx0")) {
796 Record mtest2 = Record.create();
797 generateMIDAndSetIdWF(allWFTO, mtest2, vars);
798 int returned = addTlinkRelTypeToMention(NWR.TLINK_FROM_TMX0,
799 vars.tLinkTypeMapper.get(fvObj.getRelType().toUpperCase()), mtest2, vars);
800 if (returned == 1) {
801 vars.tlinkMentionsEnriched++;
802 } else {
803 logDebug("Tlink FROM -> tmx0, not found the target mention id:" + fvObj.getTo(), vars);
804 }
805 continue;
806 }
807 if (fvObj.getTo().equalsIgnoreCase("tmx0")) {
808 Record mtest1 = Record.create();
809 generateMIDAndSetIdWF(allWFFrom, mtest1, vars);
810 int returned = addTlinkRelTypeToMention(NWR.TLINK_TO_TMX0,
811 vars.tLinkTypeMapper.get(fvObj.getRelType().toUpperCase()), mtest1, vars);
812 if (returned == 1) {
813 vars.tlinkMentionsEnriched++;
814 } else {
815 logDebug("Tlink TO -> tmx0, not found the source mention id:" + fvObj.getFrom(), vars);
816 }
817 continue;
818 }
819 if (allWFFrom.size() > 0) {
820 Record mtest1 = Record.create();
821 generateMIDAndSetIdWF(allWFFrom, mtest1, vars);
822 m.add(NWR.SOURCE, mtest1.getID());
823 }
824 if (allWFTO.size() > 0) {
825 Record mtest2 = Record.create();
826 generateMIDAndSetIdWF(allWFTO, mtest2, vars);
827 m.add(NWR.TARGET, mtest2.getID());
828 }
829 m.add(NWR.REL_TYPE, vars.tLinkTypeMapper.get(fvObj.getRelType().toUpperCase()));
830 generateMIDAndSetIdWF(allEventWF, m, vars);
831 deg = "MentionId:" + m.getID() + deg;
832
833 logDebug(deg, vars);
834 int addedNew = addOrMergeAMention(m, vars);
835 if (addedNew == 1) {
836 vars.tlinkMentions++;
837 } else if (addedNew == -1) {
838 vars.tlinkMentionsDiscarded++;
839 }
840
841 }
842 }
843
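    /**
     * Adds the given relation-type property to the already accepted mention with the same char
     * span; returns 1 on success, -1 if no such mention exists.
     */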
844 private static int addTlinkRelTypeToMention(URI key, URI value,
845 Record mention, processNAFVariables vars) {
846 String charS = mention.getUnique(NIF.BEGIN_INDEX, Integer.class) + "," + mention
847 .getUnique(NIF.END_INDEX, Integer.class);
848 if (vars.mentionListHash.containsKey(charS)) {
849 vars.mentionListHash.get(charS).add(key, value);
850 return 1;
851 }
852 return -1;
853 }
854
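    /** Returns the given word forms re-ordered according to their document order in the NAF &lt;text&gt; layer. */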
855 private static LinkedList<Wf> reorderWFAscending(LinkedList<Wf> list,
856 processNAFVariables vars) {
857 LinkedList<Wf> tmp = new LinkedList<Wf>();
858 int found = 0;
859 for (Wf wftmp : vars.doc.getText().getWf()) {
860 if (list.contains(wftmp)) {
861 tmp.addLast(wftmp);
862 found++;
863 }
864 if (found >= list.size()) {
865 break;
866 }
867
868 }
869
870 if (found < list.size()) {
871 logDebug("reorderWFAscending method, inconsistency: returned list less than the input list", vars);
872 }
873 return tmp;
874 }
875
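    /** Returns the word forms spanned by the timex3 with the given id. */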
876 private static List<Wf> getSpanOfTimex(String tmxId, processNAFVariables vars) {
877 List<Wf> tmp = new ArrayList<Wf>();
878 for (Timex3 tms : vars.doc.getTimeExpressions().getTimex3()) {
879 if (tms.getId().equalsIgnoreCase(tmxId)) {
880 for (Target t : tms.getSpan().getTarget()) {
881 tmp.add((Wf) t.getId());
882 }
883 break;
884 }
885 }
886 return tmp;
887 }
888
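    /** Returns the span targets (terms) of the SRL predicate with the given id, or null if it is not found. */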
889 private static List<Target> getSpanTermsOfPredicate(String predicateId,
890 processNAFVariables vars) {
891 for (Predicate pr : vars.doc.getSrl().getPredicate()) {
892 if (pr.getId().equalsIgnoreCase(predicateId)) {
893 return pr.getSpan().getTarget();
894 }
895 }
896 return null;
897 }
898
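    /**
     * Builds the URI for an SRL external reference of a known resource (PropBank, VerbNet,
     * FrameNet, NomBank, ESO); returns null for any other resource.
     */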
899 private static URIImpl getUriForSrlExternalRefResource(String type, String value) {
900 String prefix = null;
901 if (type.equalsIgnoreCase("PropBank")) {
902 prefix = "http://www.newsreader-project.eu/propbank/";
903 } else if (type.equalsIgnoreCase("VerbNet")) {
904 prefix = "http://www.newsreader-project.eu/verbnet/";
905 } else if (type.equalsIgnoreCase("FrameNet")) {
906 prefix = "http://www.newsreader-project.eu/framenet/";
907 } else if (type.equalsIgnoreCase("NomBank")) {
908 prefix = "http://www.newsreader-project.eu/nombank/";
909 } else if (type.equalsIgnoreCase("ESO")) {
910 prefix = "http://www.newsreader-project.eu/domain-ontology#";
911 }
912 if (prefix != null) {
913 return new URIImpl(prefix + value);
914 } else {
915 return null;
916 }
917 }
918
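    /**
     * Maps the SRL layer: each &lt;predicate&gt; becomes an EVENT mention and each &lt;role&gt;
     * becomes a participation relation mention (or, for AM-TMP roles, a TLINK and possibly a new
     * time mention).
     */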
919 private static void getSRLMentions(processNAFVariables vars) throws Exception {
920 Srl obj = vars.doc.getSrl();
921 if (!checkHeaderTextTerms(vars)) {
922 logError("Error: populating interrupted", vars);
923 } else {
924 logDebug("Start mapping the Srl mentions:", vars);
925 }
926
927 if ((obj == null) || (((Srl) obj).getPredicate() == null)) {
928 logError("skipped missing xpath(//NAF/srl)", vars);
929 return;
930 }
931
932
933
934 for (Predicate prdObj : ((Srl) obj).getPredicate()) {
935 String deg = "";
936 Record mtmp = null;
937 String predicateID = prdObj.getId();
938 boolean firstSpanFound = false;
939 int predicatExtRef = 0;
940 String eventMentionId = null;
941 String predicateCharSpan = null;
942 LinkedList<Term> eventTermList = new LinkedList<Term>();
943
944
945
946
947 if (prdObj.getSpan() instanceof Span) {
948 if (firstSpanFound) {
949 logDebug("Srl should have one span only! ", vars);
950 }
951 if (!firstSpanFound) {
952 firstSpanFound = true;
953 }
954 predicateCharSpan = getCharSpanFromSpan(prdObj.getSpan(), vars);
955 mtmp = Record.create();
956 mtmp.add(RDF.TYPE, NWR.EVENT_MENTION, NWR.TIME_OR_EVENT_MENTION,
957 NWR.ENTITY_MENTION, KS.MENTION);
958 deg = "|TYPE:EVENT_MENTION,TIME_OR_EVENT_MENTION,ENTITY_MENTION,MENTION";
959 mtmp.add(KS.MENTION_OF, vars.news_file_id);
960 for (Target tars : prdObj.getSpan().getTarget()) {
962                    Term eventTerm = getTermfromTermId((Term) tars.getId(), vars);
963 eventTermList.addLast(eventTerm);
964 if (eventTerm.getLemma() != null) {
965 mtmp.add(NWR.PRED, eventTerm.getLemma());
966 deg += "|PRED:" + eventTerm.getLemma();
967 }
968 if (eventTerm.getPos() != null) {
969 URI posVal = (eventTerm.getPos().equals("V") ||
970 eventTerm.getPos().equals("N"))
971 ? vars.partOfSpeechMapper.get(eventTerm.getPos())
972 : vars.partOfSpeechMapper.get("");
973 mtmp.add(NWR.POS, posVal);
974 deg += "|POS:" + posVal;
975 }
976 }
977 generateTheMIdAndSetID(prdObj.getSpan(), mtmp, vars);
978 deg = "MentionId:" + mtmp.getID() + deg;
979
980 }
981
982
983
984
985
986 List<PredicateAnchor> prds = getAllRelativePredicateAnchors(prdObj.getId(), vars);
987 for (PredicateAnchor tprd : prds) {
988
989 if (tprd.getAnchorTime() != null) {
990 LinkedList<Wf> wfL = reorderWFAscending(
991 fromSpanGetAllMentionsTmx(((Timex3) tprd.getAnchorTime()).getSpan().getTarget(), vars),
992 vars);
993 if (wfL.size() > 0) {
994 Record mtest1 = Record.create();
995 generateMIDAndSetIdWF(wfL, mtest1, vars);
996 mtmp.add(NWR.ANCHOR_TIME, mtest1.getID());
997 }
998 }
999
1000 if (tprd.getBeginPoint() != null) {
1001 LinkedList<Wf> wfL = reorderWFAscending(
1002 fromSpanGetAllMentionsTmx(((Timex3) tprd.getBeginPoint()).getSpan().getTarget(), vars),
1003 vars);
1004 if (wfL.size() > 0) {
1005 Record mtest1 = Record.create();
1006 generateMIDAndSetIdWF(wfL, mtest1, vars);
1007 mtmp.add(NWR.BEGIN_POINT, mtest1.getID());
1008 }
1009 }
1010
1011 if (tprd.getEndPoint() != null) {
1012 LinkedList<Wf> wfL = reorderWFAscending(
1013 fromSpanGetAllMentionsTmx(((Timex3) tprd.getEndPoint()).getSpan().getTarget(), vars), vars);
1014 if (wfL.size() > 0) {
1015 Record mtest1 = Record.create();
1016 generateMIDAndSetIdWF(wfL, mtest1, vars);
1017 mtmp.add(NWR.END_POINT, mtest1.getID());
1018 }
1019 }
1020 }
1021 deg += "| ANCHOR_TIME:" + mtmp.get(NWR.ANCHOR_TIME) + " | BEGIN_POINT:" + mtmp.get(NWR.BEGIN_POINT)
1022 + " | END_POINT:" + mtmp.get(NWR.END_POINT);
1023
1024
1025
1026 for (Object prdGObj : prdObj.getExternalReferencesOrRole()) {
1027
1028
1029
1030
1031 if (prdGObj instanceof ExternalReferences) {
1032 boolean eventTypeFound = false;
1033 if (predicatExtRef > 1) {
1034 logDebug("more than one external ref for predicate:" + predicateID
1035 + " size: " + predicatExtRef, vars);
1036 }
1037 predicatExtRef++;
1038 for (ExternalRef exrObj : ((ExternalReferences) prdGObj).getExternalRef()) {
1039 if (mtmp != null) {
1040
1041
1042 String resourceValue = exrObj.getResource();
1043 String referenceValue = exrObj.getReference();
1044
1045 if (resourceValue != null) {
1046
1047
1048
1049 if (!resourceValue.equalsIgnoreCase("EventType")) {
1050
1051 URI resourceMappedValue = vars.srlExternalRefResourceTypeMapper.get(resourceValue);
1052 URIImpl valueURI;
1053 if (resourceMappedValue != null) {
1054 valueURI = getUriForSrlExternalRefResource(resourceValue, referenceValue);
1055 } else {
1056
1057
1058 resourceMappedValue = ValueFactoryImpl.getInstance()
1059 .createURI(NWR.NAMESPACE, resourceValue.toLowerCase() + "Ref");
1060 valueURI = new URIImpl("http://www.newsreader-project.eu/"
1061 + resourceValue.toLowerCase() + "/" + referenceValue);
1062 }
1063 mtmp.add(resourceMappedValue, valueURI);
1064 deg += "|" + resourceMappedValue + ":" + valueURI;
1065 } else {
1066
1067
1068 URI referenceMappedValue = null;
1069 if (referenceValue != null) {
1070 referenceMappedValue = vars.eventClassMapper.get(referenceValue);
1071 }
1072 if (referenceMappedValue == null) {
1073
1074
1075 referenceMappedValue = NWR.EVENT_SPEECH_COGNITIVE;
1076 }
1077
1078 mtmp.add(NWR.EVENT_CLASS, referenceMappedValue);
1079 deg += "|EVENT_CLASS:" + referenceMappedValue;
1080 eventTypeFound = true;
1081 }
1082 } else {
1083
1084 logDebug(
1085 "xpath(//NAF/srl/predicate/externalReferences/externalRef@Resource(NULL)): predicateID("
1086 + predicateID + ")", vars);
1087 }
1088 } else {
1089 logDebug(
1090 "Mapping error - Mention null - xpath(NAF/srl/predicate/externalReferences/externalRef): predicateID("
1091 + predicateID + ")", vars);
1092 }
1093 }
1094 if (!eventTypeFound) {
1095 mtmp.add(NWR.EVENT_CLASS, vars.eventClassMapper.get("contextual"));
1096 deg += "|EVENT_CLASS:" + vars.eventClassMapper.get("contextual");
1097 eventTypeFound = true;
1098 }
1099 if (eventTypeFound) {
1100 logDebug(deg, vars);
1101 int addedNew = addOrMergeAMention(mtmp, vars);
1102
1103 logDebug("ROL1: <srl> <predicate> adding new event mention for id "
1104 + predicateID + ", charSpan |" + predicateCharSpan + "|", vars);
1105
1106                    if (addedNew == 1 || addedNew == 0) {
1107 vars.srlMention2++;
1108 vars.srlMention++;
1109 }
1110 String charS2 = mtmp.getUnique(NIF.BEGIN_INDEX, Integer.class) + "," + mtmp
1111 .getUnique(NIF.END_INDEX, Integer.class);
1112 vars.entityMentions.put(charS2, mtmp);
1113
1114 eventMentionId = mtmp.getID().stringValue();
1115 } else {
1116
1117
1118 logDebug("Mention discarded for predicateID(" + predicateID + ") - ID("
1119 + mtmp.getID().toString() + ")", vars);
1120 }
1121 } else
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134 if (prdGObj instanceof Role) {
1135 boolean MenCreated = false;
1136 String charS = null;
1137 String deg2 = "";
1138 LinkedList<Term> roleTermList = null;
1139
1140 Span roleSpan = ((Role) prdGObj).getSpan();
1141 String roleCharSpan = getCharSpanFromSpan(roleSpan, vars);
1142 boolean createTemporalexpressionMentionFlag = false;
1143 boolean createTlinkFlag = false;
1144 boolean createParticipationMentionFlag = false;
1145
1146 String semRole = ((Role) prdGObj).getSemRole();
1147 if ((semRole != null) && (semRole.equalsIgnoreCase("AM-TMP"))) {
1148 createTlinkFlag = true;
1149 createTemporalexpressionMentionFlag = true;
1150 logDebug(" ROL1: <srl> <role> found TLINK for |" + predicateCharSpan + "|" + roleCharSpan
1151 + "|", vars);
1152 } else {
1153 if (checkAlreadyAcceptedMention(roleCharSpan, vars)) {
1154 createParticipationMentionFlag = true;
1155 logDebug(
1156 " ROL1: <srl> <role> found already existent mention for |" + roleCharSpan + "|",
1157 vars);
1158 }
1159 }
1160
1161
1162
1163
1164
1165 if (createTemporalexpressionMentionFlag) {
1166 if (!checkAlreadyAcceptedMention(roleCharSpan, vars)) {
1167 Record roleM = Record.create();
1168 roleM.add(RDF.TYPE, NWR.TIME_MENTION, NWR.TIME_OR_EVENT_MENTION,
1169 NWR.ENTITY_MENTION, KS.MENTION);
1170 roleM.add(KS.MENTION_OF, vars.news_file_id);
1171
1172
1173 roleTermList = new LinkedList<Term>();
1174 for (Target rspnTar : ((Role) prdGObj).getSpan().getTarget()) {
1175 Term ttmp = getTermfromTermId((Term) rspnTar.getId(), vars);
1176 roleTermList.addLast(ttmp);
1177 }
1178
1179
1180 generateTheMIdAndSetID(roleSpan, roleM, vars);
1181
1182
1183 if (addOrMergeAMention(roleM, vars) == 1) {
1184 vars.timeMention2++;
1185 }
1186
1187 logDebug(" ROL1: created temporal-expression mention for |" + roleCharSpan + "|",
1188 vars);
1189 }
1190 }
1191
1192
1193
1194
1195
1196
1197
1198 if (createParticipationMentionFlag || createTlinkFlag) {
1199 Record relationM = Record.create();
1200
1201
1202
1203
1204 if (eventMentionId != null) {
1205 relationM.add(NWR.SOURCE, new URIImpl(eventMentionId));
1206 deg2 += "|SOURCE:" + eventMentionId;
1207 } else {
1208 logDebug("//NAF/srl/predicate/role/ - a Role without Predicate roleID("
1209 + ((Role) prdGObj).getId() + ")", vars);
1210 }
1211
1212
1213
1214
1215 {
1216 URI roleId = getMentionIDFromCharSpan(roleCharSpan, vars);
1217 relationM.add(NWR.TARGET, roleId);
1218 }
1219
1220
1221
1222
1223 if (createParticipationMentionFlag) {
1224 relationM.add(RDF.TYPE, NWR.PARTICIPATION,
1225 NWR.RELATION_MENTION, KS.MENTION);
1226 deg2 = "|TYPE:PARTICIPATION,RELATION_MENTION,MENTION|" + deg2;
1227 relationM.add(NWR.THEMATIC_ROLE, ((Role) prdGObj).getSemRole());
1228 deg2 += "|THEMATIC_ROLE:" + ((Role) prdGObj).getSemRole();
1229 } else {
1230 relationM.add(RDF.TYPE, NWR.TLINK,
1231 NWR.RELATION_MENTION, KS.MENTION);
1232 deg2 = "|TYPE:TLINK,RELATION_MENTION,MENTION|" + deg2;
1233 }
1234 relationM.add(KS.MENTION_OF, vars.news_file_id);
1235
1236
1237
1238
1239
1240 if (roleTermList == null) {
1241 roleTermList = new LinkedList<Term>();
1242 for (Target rspnTar : ((Role) prdGObj).getSpan().getTarget()) {
1243 Term ttmp = getTermfromTermId((Term) rspnTar.getId(), vars);
1244 roleTermList.addLast(ttmp);
1245 }
1246 }
1247 generateTheMIdAndSetID_forParticipationMention(eventTermList, roleTermList, relationM,
1248 vars);
1249 deg2 = "MentionId:" + relationM.getID() + deg2;
1250 boolean create = false;
1251
1252
1253
1254
1255 int addedNew = -1;
1256 MenCreated = true;
1257
1258 charS = relationM.getUnique(NIF.BEGIN_INDEX, Integer.class) + "," + relationM
1259 .getUnique(NIF.END_INDEX, Integer.class);
1260 if (createParticipationMentionFlag) {
1261 logDebug(" ROL1: <srl> <predicate> <role> adding new participation mention for |"
1262 + charS + "|", vars);
1263 } else {
1264 logDebug(
1265 " ROL1: <srl> <predicate> <role> adding new TLINK mention for |" + charS + "|",
1266 vars);
1267 }
1268
1269
1270
1271
1272 addedNew = addOrMergeAMention(relationM, vars);
1273 if (addedNew == 1) {
1274 vars.rolewithEntity++;
1275 vars.rolewithEntity2++;
1276 vars.roleMentions++;
1277 } else {
1278
1279 if (create) {
1280 vars.rolewithoutEntity++;
1281 }
1282 }
1283
1284
1285
1286
1287 for (ExternalReferences roleGOBJ : ((Role) prdGObj).getExternalReferences()) {
1288 for (ExternalRef rexRefObj : roleGOBJ.getExternalRef()) {
1289
1290
1291
1292 String resourceValue = rexRefObj.getResource();
1293 String referenceValue = rexRefObj.getReference();
1294
1295 if (resourceValue != null) {
1296
1297 URI resourceMappedValue = vars.srlExternalRefResourceTypeMapper
1298 .get(resourceValue);
1299 URIImpl valueURI;
1300 if (resourceMappedValue != null) {
1301 valueURI = getUriForSrlExternalRefResource(resourceValue, referenceValue);
1302 } else {
1303
1304
1305 resourceMappedValue = ValueFactoryImpl.getInstance()
1306 .createURI(NWR.NAMESPACE, resourceValue.toLowerCase() + "Ref");
1307 valueURI = new URIImpl("http://www.newsreader-project.eu/"
1308 + resourceValue.toLowerCase() + "/" + referenceValue);
1309 }
1310 if (charS != null) {
1311 vars.mentionListHash.get(charS).add(resourceMappedValue, valueURI);
1312 }
1313 deg2 += "|" + resourceMappedValue + ":" + valueURI;
1314 } else {
1315
1316 logDebug(
1317 "xpath(//NAF/srl/predicate/role/externalReferences/externalRef@Resource(NULL)): RoleID("
1318 + ((Role) prdGObj).getId() + ")", vars);
1319 }
1320 }
1321
1322 }
1323 }
1324 logDebug(deg2, vars);
1325
1326 }
1327
1328 }
1329
1330 }
1331 }
1332
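    /** Returns all &lt;predicateAnchor&gt; elements whose span targets include the predicate with the given id. */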
1333 private static List<PredicateAnchor> getAllRelativePredicateAnchors(String id,
1334 processNAFVariables vars) {
1335 List<PredicateAnchor> tmp = new ArrayList<PredicateAnchor>();
1336 for (PredicateAnchor pas : vars.doc.getTemporalRelations().getPredicateAnchor()) {
1337 for (Span t : pas.getSpan()) {
1338 for (Target tm : t.getTarget()) {
1339 if (((Predicate) tm.getId()).getId().equalsIgnoreCase(id)) {
1340 tmp.add(pas);
1341 break;
1342 }
1343 }
1344 }
1345 }
1346 return tmp;
1347 }
1348
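    /**
     * Reads the NAF header: creates the news and NAF resource records and copies the file
     * description and linguistic-processor metadata onto them.
     */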
1349 private static void getNAFHEADERMentions(NafHeader obj, processNAFVariables vars) throws Exception {
1350 logDebug("Start reading the naf metadata:", vars);
1351 String deg = "";
1352 Public publicProp = ((NafHeader) obj).getPublic();
1353 initURIIDS(publicProp, vars);
1354 Record newsFile = Record.create();
1355 Record nafFile = Record.create();
1356 vars.nafFile2 = nafFile;
1357 vars.newsFile2 = newsFile;
1358 newsFile.setID(vars.news_file_id);
1359 nafFile.setID(vars.NAF_file_id);
1360 deg += "news_file_id:" + vars.news_file_id;
1361 newsFile.add(RDF.TYPE, NWR.NEWS);
1362 deg += "\nNAF_file_id:" + vars.NAF_file_id;
1363
1364 nafFile.add(RDF.TYPE, NWR.NAFDOCUMENT);
1365 nafFile.add(NWR.ANNOTATION_OF, newsFile.getID());
1366 newsFile.add(NWR.ANNOTATED_WITH, nafFile.getID());
1367 if (vars.doc.getVersion() != null) {
1368 nafFile.add(NWR.VERSION, vars.doc.getVersion());
1369 deg += "\nVERSION:" + vars.doc.getVersion();
1370 }
1371
1372
1373
1374
1375 URIImpl sourceURL;
1376 if (vars.PREFIX.matches("(?i)http://www.newsreader-project.eu/LNdata.*")) {
1377
1378 String preStr = "http://www.lexisnexis.com/uk/nexis/docview/getDocForCuiReq?lni=";
1379 String postStr = "&csi=138620&perma=true";
1380            String srcUrlstr = preStr + publicProp.getPublicId() + postStr;
1381 sourceURL = new URIImpl(srcUrlstr);
1382 } else {
1383
1384 sourceURL = (URIImpl) vars.news_file_id;
1385 }
1386 newsFile.add(DCTERMS.SOURCE, sourceURL);
1387
1388 if (publicProp.getPublicId() != null) {
1389 nafFile.add(DCTERMS.IDENTIFIER, publicProp.getPublicId());
1390 deg += "|IDENTIFIER:" + publicProp.getPublicId();
1391 }
1392 if (vars.doc.getXmlLang() != null) {
1393 newsFile.add(DCTERMS.LANGUAGE, Data.languageCodeToURI(vars.doc.getXmlLang()));
1394 deg += "|LANGUAGE:" + Data.languageCodeToURI(vars.doc.getXmlLang());
1395 } else {
1396 logWarn("Language not catched:" + vars.doc.getXmlLang(), vars);
1397 }
1398 FileDesc fileDesc = null;
1399 if (((NafHeader) obj).getFileDesc() != null) {
1400 fileDesc = ((NafHeader) obj).getFileDesc();
1401 if (fileDesc.getTitle() != null) {
1402 newsFile.add(DCTERMS.TITLE, fileDesc.getTitle());
1403 deg += "|TITLE:" + fileDesc.getTitle();
1404 }
1405 if (fileDesc.getAuthor() != null) {
1406 newsFile.add(DCTERMS.CREATOR, fileDesc.getAuthor());
1407 deg += "|Author:" + fileDesc.getAuthor();
1408 }
1409 if (fileDesc.getCreationtime() != null) {
1410 newsFile.add(DCTERMS.CREATED, fileDesc.getCreationtime());
1411 deg += "|Creationtime:" + fileDesc.getCreationtime();
1412 }
1413
1414 if (fileDesc.getSection() != null) {
1415 newsFile.add(NWR.SECTION, fileDesc.getSection());
1416 deg += "|SECTION:" + fileDesc.getSection();
1417 }
1418
1419 if (fileDesc.getMagazine() != null) {
1420 newsFile.add(NWR.MAGAZINE, fileDesc.getMagazine());
1421 deg += "|MAGAZINE:" + fileDesc.getMagazine();
1422 }
1423
1424 if (fileDesc.getLocation() != null) {
1425 newsFile.add(NWR.LOCATION, fileDesc.getLocation());
1426 deg += "|LOCATION:" + fileDesc.getLocation();
1427 }
1428
1429 if (fileDesc.getPublisher() != null) {
1430 newsFile.add(NWR.PUBLISHER, fileDesc.getPublisher());
1431 deg += "|PUBLISHER:" + fileDesc.getPublisher();
1432 }
1433
1434 if (fileDesc.getFilename() != null) {
1435 newsFile.add(NWR.ORIGINAL_FILE_NAME, fileDesc.getFilename());
1436 deg += "|Filename:" + fileDesc.getFilename();
1437 }
1438
1439 if (fileDesc.getFiletype() != null) {
1440 newsFile.add(NWR.ORIGINAL_FILE_FORMAT, fileDesc.getFiletype());
1441 deg += "|Filetype:" + fileDesc.getFiletype();
1442 }
1443
1444 if (fileDesc.getPages() != null) {
1445 newsFile.add(NWR.ORIGINAL_PAGES, fileDesc.getPages());
1446 deg += "|Pages:" + fileDesc.getPages();
1447 }
1448 } else {
1449 logWarn("FileDesc: null", vars);
1450 }
1451 for (LinguisticProcessors lpObj : ((NafHeader) obj).getLinguisticProcessors()) {
1452 deg += "\n";
1453 if (lpObj.getLayer() != null) {
1454 if (vars.nafLayerMapper.containsKey(lpObj.getLayer())
1455 && vars.nafLayerMapper.get(lpObj.getLayer()) != null) {
1456 nafFile.add(NWR.LAYER, vars.nafLayerMapper.get(lpObj.getLayer()));
1457 deg += "LAYER:" + vars.nafLayerMapper.get(lpObj.getLayer());
1458 } else {
1459 logDebug("xpath(//NAF/nafHeader/linguisticProcessors/@layer["
1460 + lpObj.getLayer() + "]), unknown layer.", vars);
1461 }
1462 }
1463
1464 for (Lp lpO : lpObj.getLp()) {
1465 deg += "\n";
1466 Record r3 = Record.create();
1467 if (lpO.getName() != null) {
1468 r3.add(DCTERMS.TITLE, lpO.getName());
1469 deg += "TITLE:" + lpO.getName();
1470 }
1471 if (lpO.getVersion() != null) {
1472 r3.add(NWR.VERSION, lpO.getVersion());
1473 deg += "|VERSION:" + lpO.getVersion();
1474 }
1475                String namuri = java.net.URLEncoder.encode(lpO.getName(), "UTF-8");
1476 String uri = vars.PREFIX + (vars.PREFIX.endsWith("/") ? "" : "/") + "lp/" + namuri + "/"
1477 + lpO.getVersion();
1478
1479 URI rId = new URIImpl(uri);
1480 r3.setID(rId);
1481 nafFile.add(NWR.MODULES, r3);
1482 deg += "|CREATOR:" + r3;
1483 }
1484 }
1485
1486 logDebug(deg, vars);
1487 }
1488
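    /**
     * Maps &lt;coref&gt; clusters: event corefs always produce mentions, while non-event corefs
     * are kept only if one of their spans includes an already accepted mention.
     */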
1489 private static void getCoreferencesMentions(Coreferences obj, processNAFVariables vars) throws Exception {
1490 if (!checkHeaderTextTerms(vars)) {
1491 logError("Error: populating interrupted", vars);
1492 } else {
1493 logDebug("Start mapping the Coreferences mentions:", vars);
1494 }
1495 String deg = "\n";
1496
1497
1498
1499
1500 for (Coref corefObj : ((Coreferences) obj).getCoref()) {
1501 deg = "";
1502 if (corefObj.getSpan().size() < 1) {
1503 logDebug("Every coref must contain a 'span' element inside 'references'", vars);
1504 }
1505
1506
1507
1508
1509
1510
1511 boolean addMentionsFlag = false;
1512
1513 String corefType = corefObj.getType();
1514 List<Object> typesOfIncludedMention = null;
1515 boolean eventM = false;
1516 if (corefType != null && corefType.equalsIgnoreCase("event")) {
1517
1518 addMentionsFlag = true;
1519 eventM = true;
1520 } else {
1521
1522
1523 for (Span corefSpan : corefObj.getSpan()) {
1524 String corefCharSpan = getCharSpanFromSpan(corefSpan, vars);
1525 Object[] retArray = checkSpanIncludesAnAlreadyAcceptedMention(corefCharSpan, vars);
1526 int inclFlag = ((Integer) retArray[0]).intValue();
1527 if ((inclFlag == 1) || (inclFlag == 2)) {
1528 addMentionsFlag = true;
1529 String includedMentionCharSpan = (String) retArray[1];
1530 typesOfIncludedMention = getMentionTypeFromCharSpan(includedMentionCharSpan, vars);
1531 logDebug("ROL1: <coref> id " + corefObj.getId() + ": found included mention for |"
1532 + corefCharSpan + "|, included mention |" + includedMentionCharSpan
1533 + "|, inclFlag " + inclFlag
1534 + ", types " + getTypeasString(typesOfIncludedMention), vars);
1535 break;
1536 }
1537 }
1538 }
1539
1540 if (addMentionsFlag) {
1541 for (Span corefSpan : corefObj.getSpan()) {
1542 String corefCharSpan = getCharSpanFromSpan(corefSpan, vars);
1543 if (checkAlreadyAcceptedMention(corefCharSpan, vars)) {
1544 logDebug("ROL1: <coref> id " + corefObj.getId()
1545 + ": skipping already existent mention with charSpan |"
1546 + corefCharSpan + "|", vars);
1547 continue;
1548 }
1549 deg = "";
1550 Record m = Record.create();
1551 m.add(KS.MENTION_OF, vars.news_file_id);
1552 deg += "MENTION_OF:" + vars.news_file_id;
1553 if (corefObj.getSpan().size() > 1) {
1554 m.add(NWR.LOCAL_COREF_ID, corefObj.getId());
1555 deg += "|LOCAL_COREF_ID:" + corefObj.getId();
1556 }
1557
1558 if (eventM) {
1559 m.add(RDF.TYPE, NWR.EVENT_MENTION, NWR.TIME_OR_EVENT_MENTION,
1560 NWR.ENTITY_MENTION, KS.MENTION);
1561 deg = "|TYPE:EVENT_MENTION,TIME_OR_EVENT_MENTION,ENTITY_MENTION,ENTITY_MENTION"
1562 + deg;
1563 eventM = true;
1564 } else {
1565 m.add(RDF.TYPE, NWR.OBJECT_MENTION, NWR.ENTITY_MENTION, KS.MENTION);
1566 deg = "TYPE:,OBJECT_MENTION,ENTITY_MENTION,ENTITY_MENTION|" + deg;
1567
1568
1569 if (typesOfIncludedMention != null) {
1570 m.add(RDF.TYPE, typesOfIncludedMention);
1571 }
1572 }
1573 logDebug("ROL1: <coref> id " + corefObj.getId() + ": adding new mention with charSpan |"
1574 + corefCharSpan + "|, and type " + m.get(RDF.TYPE), vars);
1575
1576 if (corefSpan.getTarget().size() < 1) {
1577 logDebug("Every span in an entity must contain at least one target inside", vars);
1578 }
1579 for (Target spTar : corefSpan.getTarget()) {
1580 if (eventM) {
1581 Term eventTerm = getTermfromTermId((Term) spTar.getId(), vars);
1582 m.add(NWR.PRED, eventTerm.getLemma());
1583 deg += "|PRED:" + eventTerm.getLemma();
1584 if (eventTerm.getPos() != null) {
1585 URI posVal = (eventTerm.getPos().equals("V") ||
1586 eventTerm.getPos().equals("N"))
1587 ? vars.partOfSpeechMapper.get(eventTerm.getPos())
1588 : vars.partOfSpeechMapper.get("");
1589 m.add(NWR.POS, posVal);
1590 deg += "|POS:" + posVal;
1591 } else {
1592 logDebug("//NAF/coreferences/coref/span/target/@id/@getPOS[null], id("
1593 + eventTerm.getId() + ")", vars);
1594 }
1595 }
1596 if (!eventM && spTar.getHead() != null && spTar.getHead().equals("yes")) {
1597 if (spTar.getId() != null) {
1598 m.add(NWR.SYNTACTIC_HEAD, spTar.getId());
1599 deg += "|SYNTACTIC_HEAD:" + spTar.getId();
1600 } else {
1601 logDebug("//NAF/coreferences/coref/span/target[@head='yes']/@id[null], id("
1602 + spTar.getId() + ")", vars);
1603 }
1604 }
1605 }
1606 generateTheMIdAndSetID(corefSpan, m, vars);
1607 deg = "MentionId:" + m.getID() + deg;
1608 logDebug(deg, vars);
1609
1610 int addedNew = addOrMergeAMention(m, vars);
1611
1612 if (!eventM) {
1613
1614 if (addedNew == 1) {
1615 vars.corefMention2++;
1616 vars.no_mapping++;
1617 vars.corefMentionNotEvent++;
1618 vars.corefMention++;
1619
1620 }
1621 } else {
1622
1623 if (addedNew == 1) {
1624 vars.srlMention2++;
1625 vars.corefMentionEvent++;
1626 vars.corefMention++;
1627 }
1628 }
1629 String charS2 = m.getUnique(NIF.BEGIN_INDEX, Integer.class) + "," + m
1630 .getUnique(NIF.END_INDEX, Integer.class);
1631 vars.entityMentions.put(charS2, m);
1632 }
1633 } else {
1634 logDebug("ROL1: <coref> id " + corefObj.getId() + ": entirely skipped, NO included mentions", vars);
1635 }
1636 }
1637 }
1638
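    /** Concatenates the event and role term lists (event terms first). */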
1639 private static LinkedList<Term> mergeTwoTermLists(LinkedList<Term> eventTermList,
1640 LinkedList<Term> roleTermList, processNAFVariables vars) {
1641
1642 LinkedList<Term> merged = new LinkedList<Term>();
1643
1644
1645
1646 for (Term evn : eventTermList) {
1647 merged.addLast(evn);
1648 }
1649 for (Term rol : roleTermList) {
1650 merged.addLast(rol);
1651 }
1652 logDebug("Two lists merged: eventTermListSize(" + eventTermList.size()
1653 + ") + roleTermListSize(" + roleTermList.size() + ") = mergedListSize("
1654 + merged.size() + ").", vars);
1655 return merged;
1656 }
1657
1658
1659
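    /** True if a mention with exactly this "begin,end" char span has already been accepted. */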
1660 private static boolean checkAlreadyAcceptedMention(String charSpan, processNAFVariables vars) {
1661 return vars.mentionListHash.containsKey(charSpan);
1662 }
1663
1664
1665
1666
1667
1668
1669
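    /**
     * Returns {flag, charSpan}: flag 1 means the span itself is an accepted mention, 2 means the
     * span includes an accepted mention (whose char span is returned), 0 means no match.
     */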
1670 private static Object[] checkSpanIncludesAnAlreadyAcceptedMention(String charSpan, processNAFVariables vars) {
1671 Object[] retArray = new Object[2];
1672
1673 if (checkAlreadyAcceptedMention(charSpan, vars)) {
1674            retArray[0] = Integer.valueOf(1);
1675 retArray[1] = charSpan;
1676 return retArray;
1677 }
1678
1679
1680 String[] fields = charSpan.split(",");
1681 int spanBeginC = Integer.parseInt(fields[0]);
1682 int spanEndC = Integer.parseInt(fields[1]);
1683
1684        Enumeration<String> keys = vars.mentionListHash.keys();
1685        String[] kfields;
1686        int kBeginC;
1687        int kEndC;
1688        while (keys.hasMoreElements()) {
1689            String key = keys.nextElement();
1690 kfields = key.split(",");
1691 kBeginC = Integer.parseInt(kfields[0]);
1692 kEndC = Integer.parseInt(kfields[1]);
1693 if ((kBeginC >= spanBeginC) && (kEndC <= spanEndC)) {
1694                retArray[0] = Integer.valueOf(2);
1695 retArray[1] = key;
1696 return retArray;
1697 }
1698 }
1699
1700        retArray[0] = Integer.valueOf(0);
1701 retArray[1] = null;
1702 return retArray;
1703 }
1704
1705
1706
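    /** Computes the "beginOffset,endOffset" character span covered by a NAF span. */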
1707 private static String getCharSpanFromSpan(Span sp, processNAFVariables vars) {
1708 LinkedList<Wf> wordsL = fromSpanGetAllMentions(sp.getTarget(), vars);
1709 String begin = wordsL.getFirst().getOffset();
1710 Wf lastW = wordsL.getLast();
1711 int end = Integer.parseInt(lastW.getOffset()) + Integer.parseInt(lastW.getLength());
1712 String charSpan = begin + "," + Integer.toString(end);
1713 return charSpan;
1714 }
1715
1716
1717
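    /** Returns the id of the accepted mention with the given char span, or null. */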
1718 private static URI getMentionIDFromCharSpan(String charSpan, processNAFVariables vars) {
1719 if (vars.mentionListHash.containsKey(charSpan)) {
1720 return vars.mentionListHash.get(charSpan).getID();
1721 } else {
1722 return null;
1723 }
1724 }
1725
1726
1727
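    /** Returns the RDF types of the accepted mention with the given char span, or null. */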
1728 private static List<Object> getMentionTypeFromCharSpan(String charSpan, processNAFVariables vars) {
1729 if (vars.mentionListHash.containsKey(charSpan)) {
1730 return vars.mentionListHash.get(charSpan).get(RDF.TYPE);
1731 } else {
1732 return null;
1733 }
1734 }
1735
1736
1737
1738
1739
1740
1741
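    /**
     * Adds the mention, keyed by its char span, or merges it with an already accepted one.
     * Returns 1 if a new mention was created, 0 if an existing mention was enriched or replaced,
     * and -1 if the mention was discarded because of a class collision.
     */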
    private static Integer addOrMergeAMention(Record m, processNAFVariables vars) {
        String charS = m.getUnique(NIF.BEGIN_INDEX, Integer.class) + "," + m.getUnique(NIF.END_INDEX, Integer.class);
        if (vars.mentionListHash.containsKey(charS)) {

            // a mention with the same character span already exists: check class compatibility
            boolean chk = checkClassCompatibility(vars.mentionListHash.get(charS), m);
            if (!chk) {

                // incompatible classes: replace the old mention if the new one is more specific,
                // otherwise report a collision
                if (checkMentionReplaceability(vars.mentionListHash.get(charS), m)) {
                    vars.mentionListHash.put(charS, m);
                    logDebug("Replacement with Mention: " + m.getID() + ", class(" + getTypeasString(m.get(RDF.TYPE))
                            + ")", vars);
                    return 0;
                }

                String types = getTypeasString(m.get(RDF.TYPE));
                if (types.contains(NWR.PARTICIPATION.stringValue())) {
                    logDebug("Participation collision error, mentionID(" + m.getID() + ") class1(" + getTypeasString(
                            m.get(RDF.TYPE)) + "), class-pre-extracted(" + getTypeasString(
                            vars.mentionListHash.get(charS).get(RDF.TYPE)) + ")", vars);
                } else {
                    logDebug("Generic collision error, mentionID(" + m.getID() + ") class1(" + getTypeasString(
                            m.get(RDF.TYPE)) + "), class-pre-extracted(" + getTypeasString(
                            vars.mentionListHash.get(charS).get(RDF.TYPE)) + ")", vars);
                }
                return -1;
            } else {

                // compatible classes: enrich the existing mention with the properties of the new
                // one; participation mentions are refused as enrichments
                String types = getTypeasString(m.get(RDF.TYPE));
                if (types.contains(NWR.PARTICIPATION.stringValue())) {
                    logDebug("Refused enrichment with participation mention, mentionID(" + m.getID() + ")", vars);
                    return -1;
                }

                ListIterator<URI> mit = m.getProperties().listIterator();
                while (mit.hasNext()) {
                    URI mittmp = mit.next();
                    for (Object pit : m.get(mittmp)) {
                        vars.mentionListHash.get(charS).add(mittmp, pit);
                    }
                }
                logDebug("Mention enrichment: " + m.getID() + ", class(" + getTypeasString(m.get(RDF.TYPE)) + ")",
                        vars);
                return 0;
            }

        } else {

            // no mention with this character span yet: store it as a new mention
            vars.mentionListHash.put(charS, m);
            logDebug("Created Mention: " + m.getID(), vars);
            return 1;
        }
    }

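    /**
     * Completes every accepted mention with its textual extent (NIF anchorOf), taken from the raw
     * news text using the begin/end character offsets encoded in the mention key.
     */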
    private static void fixMentions(processNAFVariables vars) {
        Enumeration<String> keys = vars.mentionListHash.keys();
        while (keys.hasMoreElements()) {
            String key = keys.nextElement();
            Record m = vars.mentionListHash.get(key);

            // the key encodes the character span as "begin,end"
            String[] csList = key.split(",");
            int cStart = Integer.parseInt(csList[0]);
            int cEnd = Integer.parseInt(csList[1]);
            String extentStr = vars.rawText.substring(cStart, cEnd);
            m.add(NIF.ANCHOR_OF, extentStr);
        }
    }

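    /**
     * Renders the given list of RDF types as a comma-separated string (no trailing comma).
     */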
    private static String getTypeasString(List<Object> list) {
        StringBuilder tmp = new StringBuilder();
        for (Object ll : list) {
            if (tmp.length() > 0) {
                tmp.append(",");
            }
            tmp.append(ll.toString());
        }
        return tmp.toString();
    }

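    /**
     * Returns true if every RDF type of the first mention record is also a type of the second
     * one.
     */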
    private static boolean checkClassCompatibility(Record m, Record m2) {
        List<Object> types = m.get(RDF.TYPE);
        List<Object> types1 = m2.get(RDF.TYPE);
        for (Object tytmp : types) {
            if (!types1.contains(tytmp)) {
                return false;
            }
        }
        return true;
    }

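    /**
     * Decides whether an already accepted mention can be replaced by a new, incompatible one:
     * this is allowed only when the old mention is a generic object mention (no entity type) and
     * the new mention is more specific (a typed object mention, a time mention or an event
     * mention).
     */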
    private static boolean checkMentionReplaceability(Record oldM, Record newM) {
        List<Object> typesOld = oldM.get(RDF.TYPE);
        List<Object> typesNew = newM.get(RDF.TYPE);
        boolean isGenericOldM = typesOld.contains(NWR.OBJECT_MENTION)
                && (oldM.get(NWR.ENTITY_TYPE) == null || oldM.get(NWR.ENTITY_TYPE).size() == 0);
        boolean isSpecificNewM = (typesNew.contains(NWR.OBJECT_MENTION)
                && newM.get(NWR.ENTITY_TYPE) != null
                && newM.get(NWR.ENTITY_TYPE).size() > 0)
                || typesNew.contains(NWR.TIME_MENTION)
                || typesNew.contains(NWR.EVENT_MENTION);
        return isGenericOldM && isSpecificNewM;
    }

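    /**
     * Initializes the news and NAF resource URIs from the {@code <public>} element of the NAF
     * header, and derives the URI prefix ({@code vars.PREFIX}) from the news URI.
     */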
    private static void initURIIDS(Public publicProp, processNAFVariables vars) throws Exception {
        if (publicProp.getPublicId() == null) {
            logError("Corrupted NAF file: publicId is missing in the NAF header", vars);
            throw new Exception("Corrupted NAF file: publicId is missing in the NAF header");
        }
        vars.nafPublicId = publicProp.getPublicId();
        String uri = publicProp.getUri();

        vars.news_file_id = new URIImpl(uri);
        String nafuri = uri + ".naf";
        vars.NAF_file_id = new URIImpl(nafuri);
        logDebug("news_file_id: " + uri, vars);
        logDebug("NAF_file_id: " + nafuri, vars);

        // derive the prefix from the first two path segments of the news URI; fall back to the
        // namespace of the news URI if it cannot be parsed as a URL
        try {
            URL nurl = new URL(uri);
            Path p = Paths.get(nurl.getPath());
            vars.PREFIX = nurl.getProtocol() + "://" + nurl.getAuthority() + "/" + p.subpath(0, 2);
        } catch (Exception me) {
            vars.PREFIX = vars.news_file_id.getNamespace();
        }
        logDebug("PREFIX: " + vars.PREFIX, vars);
    }

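    /**
     * Sets the NIF begin/end indexes of the mention from the given word-form list and assigns the
     * mention ID as {@code <news URI>#char=<begin>,<end>}.
     */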
    static void generateMIDAndSetIdWF(LinkedList<Wf> wordsL, Record m, processNAFVariables vars) {
        int begin = Integer.parseInt(wordsL.getFirst().getOffset());
        int end = Integer.parseInt(wordsL.getLast().getOffset())
                + Integer.parseInt(wordsL.getLast().getLength());
        m.add(NIF.BEGIN_INDEX, begin);
        m.add(NIF.END_INDEX, end);
        String tmpid = vars.news_file_id + "#char=" + begin + "," + end;
        URI mId = new URIImpl(tmpid);
        m.setID(mId);
    }

    private static void logError(String error, processNAFVariables vars) throws Exception {
        if (vars.logErrorActive) {
            vars.logger.error(vars.filePath.getName() + " " + error);
        }
        if (!vars.storePartialInforInCaseOfError) {
            throw new Exception(error);
        }
    }

    private static void logDebug(String error, processNAFVariables vars) {
        if (vars.logDebugActive) {
            vars.logger.debug(error);
        }
    }

    private static void logWarn(String error, processNAFVariables vars) {
        vars.logger.warn(vars.filePath.getName() + " " + error);
    }

    private static boolean checkHeaderTextTerms(processNAFVariables vars) {
        if (vars.globalTerms == null) {
            logWarn("Error: no terms have been found!", vars);
            return false;
        }
        if (vars.globalText == null) {
            logWarn("Error: no text has been found!", vars);
            return false;
        }
        return true;
    }

    public static Logger getLogger(processNAFVariables vars) {
        return vars.logger;
    }

    public static void setLogger(final Logger logger, processNAFVariables vars) {
        vars.logger = logger;
    }

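    /**
     * Sets the NIF begin/end indexes of the mention from the word forms covered by the given NAF
     * span and assigns the mention ID as {@code <news URI>#char=<begin>,<end>}.
     */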
    private static void generateTheMIdAndSetID(Span spansObj, Record m, processNAFVariables vars) {
        LinkedList<Wf> wordsL = fromSpanGetAllMentions(spansObj.getTarget(), vars);
        int begin = Integer.parseInt(wordsL.getFirst().getOffset());
        int end = Integer.parseInt(wordsL.getLast().getOffset())
                + Integer.parseInt(wordsL.getLast().getLength());
        m.add(NIF.BEGIN_INDEX, begin);
        m.add(NIF.END_INDEX, end);
        String muri = vars.news_file_id + "#char=" + begin + "," + end;
        URI mId = new URIImpl(muri);
        m.setID(mId);
    }

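    /**
     * Sets the NIF begin/end indexes of a participation mention, spanning from the start of the
     * earlier of the event/role spans to the end of the other one, and assigns the mention ID as
     * {@code <news URI>#char=<begin>,<end>}.
     */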
    private static void generateTheMIdAndSetID_forParticipationMention(LinkedList<Term> eventTermList,
            LinkedList<Term> roleTermList, Record m, processNAFVariables vars) {
        LinkedList<Wf> eventWordList = getTheWFListByThereTermsFromTargetList(eventTermList, vars);
        LinkedList<Wf> roleWordList = getTheWFListByThereTermsFromTargetList(roleTermList, vars);

        int charStartOfEvent = Integer.parseInt(eventWordList.getFirst().getOffset());
        int charEndOfEvent = Integer.parseInt(eventWordList.getLast().getOffset())
                + Integer.parseInt(eventWordList.getLast().getLength());
        int charStartOfRole = Integer.parseInt(roleWordList.getFirst().getOffset());
        int charEndOfRole = Integer.parseInt(roleWordList.getLast().getOffset())
                + Integer.parseInt(roleWordList.getLast().getLength());

        int beginIndex, endIndex;
        if (charStartOfEvent < charStartOfRole) {
            beginIndex = charStartOfEvent;
            endIndex = charEndOfRole;
        } else {
            beginIndex = charStartOfRole;
            endIndex = charEndOfEvent;
        }
        m.add(NIF.BEGIN_INDEX, beginIndex);
        m.add(NIF.END_INDEX, endIndex);

        String muri = vars.news_file_id + "#char=" + beginIndex + "," + endIndex;
        URI mId = new URIImpl(muri);
        m.setID(mId);
    }

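    /**
     * Builds the textual extent of a participation mention by concatenating the word forms of the
     * event terms and of the role terms (the earlier-starting span first, without duplicates),
     * separated by single spaces.
     */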
    private static String getExtentOfParticipationMention(LinkedList<Term> eventTermList,
            LinkedList<Term> roleTermList, processNAFVariables vars) {
        LinkedList<Wf> eventWordList = getTheWFListByThereTermsFromTargetList(eventTermList, vars);
        LinkedList<Wf> roleWordList = getTheWFListByThereTermsFromTargetList(roleTermList, vars);

        LinkedList<Wf> mergedWordList = new LinkedList<Wf>();

        int charStartOfEvent = Integer.parseInt(eventWordList.getFirst().getOffset());
        int charStartOfRole = Integer.parseInt(roleWordList.getFirst().getOffset());

        LinkedList<Wf> firstWL, secondWL;
        if (charStartOfEvent <= charStartOfRole) {
            firstWL = eventWordList;
            secondWL = roleWordList;
        } else {
            firstWL = roleWordList;
            secondWL = eventWordList;
        }
        for (Wf w : firstWL) {
            if (!mergedWordList.contains(w)) {
                mergedWordList.add(w);
            }
        }
        for (Wf w : secondWL) {
            if (!mergedWordList.contains(w)) {
                mergedWordList.add(w);
            }
        }

        StringBuilder extent = new StringBuilder();
        for (Wf w : mergedWordList) {
            extent.append(w.getvalue()).append(" ");
        }
        String sExtent = extent.toString();
        return sExtent.substring(0, sExtent.length() - 1);
    }

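    /**
     * Returns true if an entity mention with the given URI has already been registered in
     * {@code vars.entityMentions}.
     */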
    private static boolean checkDuplicate(String muri, processNAFVariables vars) {
        return vars.entityMentions.containsKey(muri);
    }

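    /**
     * Resolves a term reference against the terms layer of the NAF document (either the cached
     * {@code vars.globalTerms} or {@code vars.doc.getTerms()}), returning the matching Term or
     * null if it cannot be found.
     */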
    private static Term getTermfromTermId(Term termId, processNAFVariables vars) {
        if (vars.globalTerms != null) {
            if (vars.globalTerms.getTerm().contains(termId)) {
                return vars.globalTerms.getTerm().get(vars.globalTerms.getTerm().indexOf(termId));
            }
        } else {
            Terms ltmp = vars.doc.getTerms();
            if (ltmp.getTerm().contains(termId)) {
                return ltmp.getTerm().get(ltmp.getTerm().indexOf(termId));
            }
        }
        logWarn("Term is not found, searched TermId(" + termId.getId() + ")", vars);
        return null;
    }

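    /**
     * Collects the Wf word forms referenced directly (by id) by the given list of span targets,
     * returning them in the order of the text layer.
     */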
    private static LinkedList<Wf> fromSpanGetAllMentionsTmx(List<Target> list, processNAFVariables vars) {
        LinkedList<Wf> returned = new LinkedList<Wf>();
        LinkedList<Wf> wordsIDL = new LinkedList<Wf>();
        for (Target ltmp : list) {
            wordsIDL.addLast((Wf) ltmp.getId());
        }
        if (vars.globalText != null) {
            int found = 0;
            for (Wf wftmp : vars.globalText.getWf()) {
                if (wordsIDL.contains(wftmp)) {
                    returned.addLast(wftmp);
                    found++;
                }
                if (found >= wordsIDL.size()) {
                    break;
                }
            }
        } else {
            Text prop = vars.doc.getText();
            int found = 0;
            for (Wf wftmp : prop.getWf()) {
                if (wordsIDL.contains(wftmp)) {
                    returned.addLast(wftmp);
                    found++;
                }
                if (found >= wordsIDL.size()) {
                    break;
                }
            }
        }
        return returned;
    }

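    /**
     * Resolves the given list of terms to the Wf word forms covered by their spans, returning
     * them in text-layer order; a warning is logged if some word forms cannot be found.
     */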
    private static LinkedList<Wf> getTheWFListByThereTermsFromTargetList(LinkedList<Term> targetTermList,
            processNAFVariables vars) {
        LinkedList<Wf> returned = new LinkedList<Wf>();
        LinkedList<Wf> wordsIDL = new LinkedList<Wf>();

        // collect the Wf ids referenced by the spans of the requested terms
        if (vars.globalTerms != null) {
            for (Term termtmp : vars.globalTerms.getTerm()) {
                if (targetTermList.contains(termtmp)) {
                    Iterator<Object> spansl = termtmp
                            .getSentimentOrSpanOrExternalReferencesOrComponent().iterator();
                    while (spansl.hasNext()) {
                        Object spantmp = spansl.next();
                        if (spantmp instanceof Span) {
                            for (Target targtmp : ((Span) spantmp).getTarget()) {
                                wordsIDL.addLast((Wf) targtmp.getId());
                            }
                        }
                    }
                }
            }
        } else {
            Terms prop = vars.doc.getTerms();
            for (Term termtmp : prop.getTerm()) {
                if (targetTermList.contains(termtmp)) {
                    Iterator<Object> spansl = termtmp
                            .getSentimentOrSpanOrExternalReferencesOrComponent().iterator();
                    while (spansl.hasNext()) {
                        Object spantmp = spansl.next();
                        if (spantmp instanceof Span) {
                            for (Target targtmp : ((Span) spantmp).getTarget()) {
                                wordsIDL.addLast((Wf) targtmp.getId());
                            }
                        }
                    }
                }
            }
        }

        // return the matching Wf elements in the order of the text layer
        if (vars.globalText != null) {
            int found = 0;
            for (Wf wftmp : vars.globalText.getWf()) {
                if (wordsIDL.contains(wftmp)) {
                    returned.addLast(wftmp);
                    found++;
                }
                if (found >= wordsIDL.size()) {
                    break;
                }
            }
            if (found < wordsIDL.size()) {
                logWarn("Inconsistent NAF file (#SW): Wf(s) not found when loading term", vars);
            }
        } else {
            Text prop = vars.doc.getText();
            int found = 0;
            for (Wf wftmp : prop.getWf()) {
                if (wordsIDL.contains(wftmp)) {
                    returned.addLast(wftmp);
                    found++;
                }
                if (found >= wordsIDL.size()) {
                    break;
                }
            }
            if (found < wordsIDL.size()) {
                logWarn("Inconsistent NAF file (#SW): Wf(s) not found when loading term", vars);
            }
        }
        return returned;
    }

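    /**
     * Resolves a span whose targets reference terms into the list of Wf word forms covered by
     * those terms.
     */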
    private static LinkedList<Wf> fromSpanGetAllMentions(List<Target> list, processNAFVariables vars) {
        LinkedList<Term> targetTermList = new LinkedList<Term>();
        Iterator<Target> targetList = list.iterator();
        while (targetList.hasNext()) {
            Target tarm = targetList.next();
            targetTermList.add((Term) tarm.getId());
        }
        return getTheWFListByThereTermsFromTargetList(targetTermList, vars);
    }

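    /**
     * Unmarshals the given NAF file into {@code vars.doc} using the JAXB model classes; I/O and
     * JAXB errors are printed and reported through logError.
     */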
    public static void readNAFFile(File naf, processNAFVariables vars) throws Exception {
        try {
            JAXBContext jc = JAXBContext.newInstance("eu.fbk.knowledgestore.populator.naf.model");
            Unmarshaller unmarshaller = jc.createUnmarshaller();
            byte[] bytes = ByteStreams.toByteArray(IO.read(naf.getAbsolutePath()));
            vars.doc = (NAF) unmarshaller.unmarshal(new ByteArrayInputStream(bytes));
        } catch (IOException | JAXBException e) {
            e.printStackTrace();
            logError(e.getMessage(), vars);
        }
    }

    static void calculateMemory() {
        int mb = 1024 * 1024;
        Runtime runtime = Runtime.getRuntime();

        System.err.println("##### Heap utilization statistics [MB] #####");
        System.err.println("Used Memory:" + (runtime.totalMemory() - runtime.freeMemory()) / mb);
        System.err.println("Free Memory:" + runtime.freeMemory() / mb);
        System.err.println("Total Memory:" + runtime.totalMemory() / mb);
        System.err.println("Max Memory:" + runtime.maxMemory() / mb);
    }
}