Adapt extract_comments.py to the update of xml_extractor.py

3f56020d · Sonja Huber · 07da08dc · 3f56020d
Commit 3f56020d authored 1 year ago by Sonja Huber
--- a/3._Tokenization_und_PoS_Tagging/extract_comments.py
+++ b/3._Tokenization_und_PoS_Tagging/extract_comments.py
@@ -3,7 +3,7 @@ Skript um aus einem Trafilatura-XML-File die Kommentare passend für Promethia z

 Input:
 1. Name der Inputdatei im XML-Format, von Trafilatura gescrapt
-2. Name der Outputdatei im XML-Format
+2. Name der Outputdatei im XML-Format, kann von Promethia verarbeitet werden

 Output:
 XML-Datei mit den Kommentaren in passendem Format für Promethia.
@@ -11,91 +11,55 @@ Falls schon etwas in der Datei steht, wird der Inhalt per default überschrieben
 """
 import argparse
 import re
-from xml_converter import check_for_tags
-
-
-def write_comments(out_file, id_count, title, comments):
-    """Schreibe Kommentare in das (bereits geöffnete) Outputfile"""
-    out_file.write("\t<text>\n")
-    out_file.write("\t\t<header>\n")
-    out_file.write("\t\t\t<id>" + str(id_count) + "</id>\n")
-    out_file.write("\t\t\t<title>" + title.group() + "</title>\n")
-    out_file.write("\t\t</header>\n")
-    out_file.write("\t\t<body>\n")
-
-    for line in comments:
-        out_file.write("\t\t\t" + check_for_tags(line) + "\n")
-
-    out_file.write("\t\t</body>\n")
-    out_file.write("\t</text>\n")
-
+#from xml_converter import check_for_tags
+import lxml.etree as ET

 def extract_comments(in_file, out_file):
-    # Input- und Outputfile werden geöffnet
-    with open(out_file, "w", encoding="utf-8") as f_out:
-        # Start des xml-files
-        f_out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
-        f_out.write("<corpus>\n")
-
-        with open(in_file, "r", encoding="utf-8") as f_in:
-            id_count = 1
-            meta_line = f_in.readline()
-
-            # Bearbeite eine Seite
-            while meta_line is not None:
-                if meta_line.strip() == "":
-                    break
-                if meta_line.strip() == "</doc>":
-                    break
-
-                # Extrahiere Seitentitel
-                site_title = re.search("(?<=title=\")[^\"|»]+", meta_line)
-
-                # Navigiere zu den Kommentaren
-                no_comments = False
-                while True:
-                    line = f_in.readline()
-                    if "<comments>" in line:
-                        # Kommentare erreicht
-                        break
-                    elif "<comments/>" in line:
-                        # Keine Kommentare
-                        no_comments = True
-                        break
-
-                # wenn die Seite keine Kommentare hat, gehe zur nächsten Seite
-                if no_comments:
-                    meta_line = f_in.readline()
-                    continue
-
-                # Lese Kommentare
-                comments = []
-                line = f_in.readline()
-                while "</comments>" not in line:  # Ende erreicht?
-                    # XXX: momentan wird die von Trafilatura generierte Anzahl Kommentare ignoriert
-                    if "</head>" in line:
-                        # Ersetze den String "X Comments" damit er später nicht das Korpus dominiert
-                        line.replace(line, "")
-                    else:
-                        comments.append(check_for_tags(line.strip()))
-
-                    line = f_in.readline()
-
-                # Gebe Kommentare aus
-                write_comments(f_out, id_count, site_title, comments)
-
-                # Gehe zur nächsten Seite
-                id_count += 1
-                meta_line = f_in.readline()
-
-        f_out.write("</corpus>")
-        print("printed no2;id =" + str(id_count))
+    # Parse the Trafilatura XML; each <doc> element is handled as one entry
+    trafi_tree = ET.parse(in_file)
+    trafi_root = trafi_tree.getroot()
+
+    # Build the Promethia output tree under a single <corpus> root
+    promethia_root = ET.ElementTree(ET.Element("corpus"))
+    corpus_root = promethia_root.getroot()
+
+    for index, doc_element in enumerate(trafi_root.iter("doc")):
+        doc_content = doc_element.find("comments")
+        if doc_content is None:
+            # <doc> has no <comments> child at all; ET.tostring(None) would raise
+            continue
+
+        # Flatten the comment subtree to plain text (computed once, reused below)
+        comment_text = ET.tostring(doc_content, method="text", encoding="unicode")
+        # Compare by content, not identity ("is not" on a str literal is unreliable),
+        # and treat an empty or whitespace-only comment section as "no comments"
+        if not comment_text.strip():
+            continue
+
+        # Entry layout: <text><header><id/><title/></header><body/></text>
+        text = ET.SubElement(corpus_root, "text")
+        header = ET.SubElement(text, "header")
+
+        # The id is the 0-based position of this <doc> among all iterated <doc>s
+        id_element = ET.SubElement(header, "id")
+        id_element.text = str(index)
+
+        # A missing "title" attribute yields None, i.e. an empty <title/>
+        title_element = ET.SubElement(header, "title")
+        title_element.text = doc_element.get("title")
+
+        body = ET.SubElement(text, "body")
+        body.text = comment_text
+
+    # Save the XML to a file
+    promethia_root.write(out_file, pretty_print=True, xml_declaration=True, encoding="utf-8")
+    print(f"XML data saved to {out_file}")
    print("Done")


 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Kommentar-Extraktor")
    parser.add_argument("in_file", type=str, help="Input-Datei (als Trafilatura-XML). Enthält alles.")
-    parser.add_argument("out_file", type=str, help="Output-Datei (als Trafilatura-XML). Enthält nur Kommentare.")
+    parser.add_argument("out_file", type=str, help="Output-Datei (als Promethia-XML). Enthält nur Kommentare.")
    args = parser.parse_args()
    extract_comments(args.in_file, args.out_file)