1. Über Python
Linguistische Datenverarbeitung mit Python
2. Beispiel: Parsing eines XML-Dokuments und Ausgabe aller <p>-Inhalte
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tokenize the identified paragraphs of a TEI XML document into a TSV file.

Every ``<p>`` element that carries an ``xml:id`` attribute is flattened to
plain text and split into tokens.  One line per token is written in the form
``author<TAB>title<TAB>token``.  Runs on Python 2.7 and Python 3.
"""
import io
import re
import xml.etree.ElementTree as etree

# Namespace prefix of TEI elements in ElementTree's "{uri}tag" notation.
TEI_NS = '{http://www.tei-c.org/ns/1.0}'
# Fully qualified name of the xml:id attribute.
XML_ID = '{http://www.w3.org/XML/1998/namespace}id'

# Punctuation marks that are detached from the text preceding them.
_PUNCTUATION = re.compile(r'([.,:;!?\-\'"])')


def _clean(text):
    """Return *text* without newlines and outer whitespace ('' for None)."""
    return (text or '').replace('\n', '').strip()


def _last_text(root, tag):
    """Return the cleaned text of the last TEI *tag* element below *root*.

    Returns '' when no such element exists.  If several elements match,
    the last one in document order wins.
    """
    value = ''
    for element in root.iter(TEI_NS + tag):
        value = _clean(element.text)
    return value


def _to_text(value):
    """Return *value* as a text string, decoding a byte string as UTF-8."""
    # Only relevant on Python 2, where pure-ASCII values can be byte strings
    # that a text-mode io stream refuses to write.
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def tokenize(text):
    """Split *text* into whitespace-separated tokens.

    A space is inserted in front of every punctuation mark, so the mark is
    separated from the preceding word but stays attached to whatever follows
    it ("a-b" becomes ["a", "-b"]).
    """
    return _PUNCTUATION.sub(r' \1', text).split()


def main(xml_path='greif.xml', csv_path='greif.csv', debug=True):
    """Write one ``author<TAB>title<TAB>token`` line per token to *csv_path*.

    xml_path -- TEI XML document to read.
    csv_path -- output file, written as UTF-8 (overwritten if present).
    debug    -- when true, print every element tag with its text first.

    Parse errors and I/O errors propagate to the caller.
    """
    # Parse before opening the output so a broken input does not leave an
    # empty output file behind.
    root = etree.parse(xml_path).getroot()

    if debug:
        for element in root.iter():
            print(element.tag + ' = ' + (element.text if element.text else ''))

    author = _last_text(root, 'author')
    title = _last_text(root, 'head')

    with io.open(csv_path, 'w', encoding='utf-8') as out:
        for paragraph in root.iter(TEI_NS + 'p'):
            # Only paragraphs that carry an xml:id are exported.
            if XML_ID not in paragraph.attrib:
                continue
            # itertext() also collects the text of nested elements.
            for token in tokenize(''.join(paragraph.itertext())):
                line = author + '\t' + title + '\t' + token + '\n'
                out.write(_to_text(line))


if __name__ == '__main__':
    main()