Skip to content
Snippets Groups Projects
Commit 54bb99f6 authored by Sonja Huber
Browse files

Merge branch 'update_extr_comments' into 'master'

adapt extract_comments.py to update on xml_extractor.py

See merge request !15
parents 07da08dc 3f56020d
No related branches found
No related tags found
1 merge request: !15 adapt extract_comments.py to update on xml_extractor.py
......@@ -3,7 +3,7 @@ Skript um aus einem Trafilatura-XML-File die Kommentare passend für Promethia z
Input:
1. Name of the input file in XML format, as scraped by Trafilatura
2. Name of the output file in XML format, which Promethia can process
Output:
XML file with the comments in a format suitable for Promethia.
......@@ -11,91 +11,55 @@ Falls schon etwas in der Datei steht, wird der Inhalt per default überschrieben
"""
import argparse
import re
from xml_converter import check_for_tags
def write_comments(out_file, id_count, title, comments):
    """Write one page's comments as a ``<text>`` entry to the open output file.

    Args:
        out_file: Writable text file object; it must already be open.
        id_count: Value written into ``<id>`` (converted with ``str``).
        title: Match object whose ``group()`` is the page title, or ``None``
            when no title was found; ``None`` produces an empty ``<title>``.
        comments: Iterable of comment lines; each line is passed through
            ``check_for_tags`` before it is written into ``<body>``.
    """
    # re.search() returns None for a page without a title attribute; emit an
    # empty title instead of failing with AttributeError on None.group().
    title_text = title.group() if title is not None else ""

    out_file.write("\t<text>\n")
    out_file.write("\t\t<header>\n")
    out_file.write("\t\t\t<id>" + str(id_count) + "</id>\n")
    out_file.write("\t\t\t<title>" + title_text + "</title>\n")
    out_file.write("\t\t</header>\n")
    out_file.write("\t\t<body>\n")
    for line in comments:
        out_file.write("\t\t\t" + check_for_tags(line) + "\n")
    out_file.write("\t\t</body>\n")
    out_file.write("\t</text>\n")
#from xml_converter import check_for_tags
import lxml.etree as ET
def extract_comments(in_file, out_file):
    """Copy the comments of every page in a Trafilatura XML file to a new file.

    The input is parsed with lxml. For each ``<doc>`` element whose
    ``<comments>`` child contains non-whitespace text, a ``<text>`` entry is
    appended to a new ``<corpus>`` root:

        <text>
          <header><id>...</id><title>...</title></header>
          <body>...</body>
        </text>

    ``<id>`` is the zero-based position of the ``<doc>`` among all ``<doc>``
    elements of the input, so ids are not consecutive when pages without
    comments are skipped. ``<title>`` is the ``title`` attribute of the
    ``<doc>``. ``<body>`` is the text content of ``<comments>`` with all
    markup removed.

    Args:
        in_file: Path of the Trafilatura XML input file.
        out_file: Path the resulting corpus XML is written to.
    """
    trafi_root = ET.parse(in_file).getroot()
    corpus_root = ET.Element("corpus")

    for index, doc_element in enumerate(trafi_root.iter("doc")):
        doc_content = doc_element.find("comments")
        if doc_content is None:
            # No <comments> child at all: tostring(None) would raise.
            continue

        # Serialise once; method="text" drops the markup and keeps the text.
        comment_text = ET.tostring(doc_content, method="text", encoding="unicode")
        # An empty <comments/> serialises to whitespace only. Compare by
        # value: an identity test against a literal ('is not') is unreliable.
        if not comment_text.strip():
            continue

        text = ET.SubElement(corpus_root, "text")
        header = ET.SubElement(text, "header")
        ET.SubElement(header, "id").text = str(index)
        ET.SubElement(header, "title").text = doc_element.get("title")
        ET.SubElement(text, "body").text = comment_text

    output_file_path = out_file
    ET.ElementTree(corpus_root).write(
        output_file_path, pretty_print=True, xml_declaration=True, encoding="utf-8"
    )
    print(f"XML data saved to {output_file_path}")
    print("Done")
if __name__ == "__main__":
    # Command-line entry point: exactly two positional paths, input then
    # output. Each positional is registered once; a second "out_file" entry
    # would make argparse demand a third argument.
    parser = argparse.ArgumentParser(description="Kommentar-Extraktor")
    parser.add_argument("in_file", type=str, help="Input-Datei (als Trafilatura-XML). Enthält alles.")
    parser.add_argument("out_file", type=str, help="Output-Datei (als Promethia-XML). Enthält nur Kommentare.")
    args = parser.parse_args()
    extract_comments(args.in_file, args.out_file)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment