Seminar für Sprachwissenschaft

XML Files


The distribution format of all GermaNet data is XML. The XML files represent all data that is available in the GermaNet database. There are two types of XML files. One type represents all synsets with their lexical units and their other properties. The other type represents all relations, both conceptual and lexical relations.

Synset Files

These files are organized around the three word categories currently considered in GermaNet: nouns, adjectives, and verbs. For each word category, a predefined set of files exists whose names follow the table of semantic fields in GermaNet. Thus, these files are named wordCategory.wordClass.xml, e.g. adj.Allgemein.xml, nomen.Tops.xml, etc.

The synset files contain all synsets with their lexical units and all other properties, as the following example shows:

<!-- Skeleton of one synset entry; upper-case values are placeholders.
     Attribute names follow the synset DTD: synset carries id, category and
     class; lexUnit carries id, sense, source, namedEntity, artificial and
     styleMarking. -->
<synset id="ID" category="CATEGORY" class="CLASS">
  <lexUnit id="ID" sense="SE" source="SRC" namedEntity="NE" artificial="AR" styleMarking="SM">
    <orthForm>ORTHFORM</orthForm>
    <orthVar>ORTHVAR</orthVar>
    <oldOrthForm>OLDORTHFORM</oldOrthForm>
    <oldOrthVar>OLDORTHVAR</oldOrthVar>
    <frame>FRAME</frame>
    <example>
      <text>TEXT</text>
      <exframe>EXFRAME</exframe>
    </example>
    <compound>
      <modifier property="PROP" category="CAT">MODIFIER1</modifier>
      <modifier property="PROP" category="CAT">MODIFIER2</modifier>
      <head property="PROP">HEAD</head>
    </compound>
  </lexUnit>
  <paraphrase>PARAPHRASE</paraphrase>
</synset>

Document Type Definition (DTD) for Synset Files

<!-- Root of a synset file: one or more synsets. -->
<!ELEMENT synsets (synset+)>

<!-- A synset: one or more lexical units, optionally followed by a paraphrase. -->
<!ELEMENT synset (lexUnit+, paraphrase?)>
<!ATTLIST synset
  id       ID                 #REQUIRED
  category (adj|nomen|verben) #REQUIRED
  class    (Allgemein|Bewegung|Gefuehl|Geist|Gesellschaft|Koerper|Menge|
            natPhaenomen|Ort|Pertonym|Perzeption|privativ|Relation|
            Substanz|Verhalten|Zeit|Artefakt|Attribut|Besitz|Form|
            Geschehen|Gruppe|Kognition|Kommunikation|Mensch|Motiv|
            Nahrung|natGegenstand|Pflanze|Tier|Tops|Koerperfunktion|
            Konkurrenz|Kontakt|Lokation|Schoepfung|Veraenderung|
            Verbrauch)        #REQUIRED>
<!ELEMENT paraphrase (#PCDATA)>

<!-- A lexical unit: a mandatory orthForm, optional spelling variants, then
     any number of frames and examples and at most one compound analysis. -->
<!ELEMENT lexUnit (orthForm, orthVar?, oldOrthForm?, oldOrthVar?,
                   frame*, example*, compound?)>
<!ATTLIST lexUnit
  id           ID       #REQUIRED
  sense        CDATA    #REQUIRED
  source       CDATA    #REQUIRED
  namedEntity  (yes|no) #REQUIRED
  artificial   (yes|no) #REQUIRED
  styleMarking (yes|no) #REQUIRED>

<!-- Text-only children of lexUnit. -->
<!ELEMENT orthForm    (#PCDATA)>
<!ELEMENT orthVar     (#PCDATA)>
<!ELEMENT oldOrthForm (#PCDATA)>
<!ELEMENT oldOrthVar  (#PCDATA)>
<!ELEMENT frame       (#PCDATA)>

<!-- An example: a text with an optional exframe. -->
<!ELEMENT example (text, exframe?)>
<!ELEMENT text    (#PCDATA)>
<!ELEMENT exframe (#PCDATA)>

<!-- A compound: one or two modifiers followed by exactly one head. -->
<!ELEMENT compound (modifier, modifier?, head)>
<!ELEMENT modifier (#PCDATA)>
<!ATTLIST modifier
  property (Abkürzung|Konfix|Fremdwort|Affixoid|Wortgruppe|Eigenname|
            opaquesMorphem) #IMPLIED
  category (Adjektiv|Nomen|Verb|Adverb|Präposition|Partikel|
            Pronomen)       #IMPLIED>
<!ELEMENT head (#PCDATA)>
<!ATTLIST head
  property (Abkürzung|Konfix|Fremdwort|Affixoid|opaquesMorphem|
            virtuelleBildung) #IMPLIED>

Relation File

This type of XML file represents both kinds of relations: conceptual and lexical relations. All relations are encoded within one XML file, which is named gn_relations.xml.

<!-- Example of a conceptual relation (con_rel). -->
<con_rel name="hyperonymy" from="ID" to="ID" dir="revert" inv="hyponymy" />
<!-- Example of a lexical relation (lex_rel); the name must be one of the
     values enumerated for lex_rel in the relation DTD, e.g. has_antonym. -->
<lex_rel name="has_antonym" from="ID" to="ID" dir="both" />

Document Type Definition (DTD) for Relation File

<!-- Root of the relation file: one or more lex_rel / con_rel entries, in any order. -->
<!ELEMENT relations (lex_rel|con_rel)+>

<!-- Lexical relation: an empty element described entirely by its attributes. -->
<!ELEMENT lex_rel EMPTY>
<!ATTLIST lex_rel
  name (has_antonym|has_participle|has_pertainym|has_active_usage|
        has_occasion|has_attribute|has_appearance|
        has_construction_method|has_container|is_container_for|
        has_consistency_of|has_component|has_owner|is_owner_of|
        has_function|has_manner_of_functioning|has_origin|
        has_production_method|has_content|has_no_property|
        has_habitat|has_location|is_location_of|has_measure|
        is_measure_of|has_material|has_member|is_member_of|
        has_diet|is_diet_of|has_eponym|has_user|has_product|
        is_product_of|has_prototypical_holder|
        is_prototypical_holder_for|has_prototypical_place_of_usage|
        has_relation|has_raw_product|has_other_property|
        is_storage_for|has_specialization|has_part|is_part_of|
        has_topic|is_caused_by|is_cause_for|is_comparable_to|
        has_usage|has_result_of_usage|has_purpose_of_usage|
        has_goods|has_time|is_access_to|has_ingredient|
        is_ingredient_of)  #REQUIRED
  dir  (one|both|revert)   #REQUIRED
  inv  CDATA               #IMPLIED
  from CDATA               #REQUIRED
  to   CDATA               #REQUIRED>

<!-- Conceptual relation: same attribute set as lex_rel, with its own name list. -->
<!ELEMENT con_rel EMPTY>
<!ATTLIST con_rel
  name (hyperonymy|meronymy|holonymy|entailment|causation|
        association)       #REQUIRED
  dir  (one|both|revert)   #REQUIRED
  inv  CDATA               #IMPLIED
  from CDATA               #REQUIRED
  to   CDATA               #REQUIRED>

Interlingual Index File

All Interlingual Index data is stored in a single file interLingualIndex_DE-EN.xml. Each record is linked to a specific lexical unit, and contains information from the Princeton WordNet.

<!-- Skeleton of one interlingual index record; upper-case values are placeholders. -->
<iliRecord lexUnitId="ID"
           ewnRelation="RELATION"
           pwnWord="WORD"
           pwn20Sense="SENSE"
           pwn20Id="ID"
           pwn30Id="ID"
           pwn20paraphrase="PARAPHRASE"
           source="SOURCE">
  <pwn20Synonyms>
    <pwn20Synonym>SYNONYM</pwn20Synonym>
    <pwn20Synonym>SYNONYM</pwn20Synonym>
  </pwn20Synonyms>
</iliRecord>

Document Type Definition (DTD) for Interlingual Index File

<!-- Root of the interlingual index file: one or more records. -->
<!ELEMENT interLingualIndex (iliRecord+)>

<!-- One record; its only possible child is an optional synonym list. -->
<!ELEMENT iliRecord (pwn20Synonyms?)>
<!ATTLIST iliRecord
  lexUnitId       CDATA #REQUIRED
  ewnRelation     (be_in_state|causes|has_holonym|has_hyperonym|
                   has_hyponym|has_meronym|has_subevent|involved|
                   is_caused_by|is_subevent_of|near_synonym|role|
                   synonym|xpos_near_synonym) #REQUIRED
  pwnWord         CDATA #REQUIRED
  pwn20Sense      CDATA #REQUIRED
  pwn20Id         CDATA #REQUIRED
  pwn30Id         CDATA #REQUIRED
  pwn20paraphrase CDATA #IMPLIED
  source          (initial|extension1) #REQUIRED>

<!-- Synonym list: one or more text-only pwn20Synonym entries. -->
<!ELEMENT pwn20Synonyms (pwn20Synonym+)>
<!ELEMENT pwn20Synonym  (#PCDATA)>

Wiktionary Paraphrases Files

There are three files containing Wiktionary paraphrases, one for each word category (nouns, verbs, and adjectives). These files are named as follows: wiktionaryParaphrases-adj.xml, wiktionaryParaphrases-nomen.xml, wiktionaryParaphrases-verben.xml.

<!-- Skeleton of one Wiktionary paraphrase entry; upper-case values are placeholders. -->
<wiktionaryParaphrase lexUnitId="ID"
                      wiktionaryId="ID"
                      wiktionarySenseId="ID"
                      wiktionarySense="PARAPHRASE"
                      edited="EDITED"/>

Each wiktionaryParaphrase, which has an identifier referring to the word in Wiktionary (wiktionaryId), a sense number (wiktionarySenseId), and a definition (wiktionarySense), is linked to a specific lexical unit in GermaNet (lexUnitId). The edited attribute specifies whether this specific Wiktionary definition is still in its original wording as extracted from Wiktionary or whether it was (slightly) modified.

Document Type Definition (DTD) for Wiktionary Paraphrases Files

<!-- Root of a Wiktionary paraphrases file: one or more entries. -->
<!ELEMENT wiktionaryParaphrases (wiktionaryParaphrase+)>

<!-- One paraphrase entry: an empty element described entirely by its attributes. -->
<!ELEMENT wiktionaryParaphrase EMPTY>
<!ATTLIST wiktionaryParaphrase
  lexUnitId         CDATA    #REQUIRED
  wiktionaryId      CDATA    #REQUIRED
  wiktionarySenseId CDATA    #REQUIRED
  wiktionarySense   CDATA    #REQUIRED
  edited            (yes|no) #REQUIRED>