From 12a53dc1796035ab9588998e8349cf6fe1a1845f Mon Sep 17 00:00:00 2001
From: Sonja Huber <sonja.huber2@uzh.ch>
Date: Thu, 30 Jan 2025 10:52:42 +0100
Subject: [PATCH 1/2] neue Metadaten option auf True gesetzt

---
 2._Web-Crawling/scrape_websites.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/2._Web-Crawling/scrape_websites.py b/2._Web-Crawling/scrape_websites.py
index b3230d1..a75bf86 100644
--- a/2._Web-Crawling/scrape_websites.py
+++ b/2._Web-Crawling/scrape_websites.py
@@ -46,7 +46,7 @@ def scrape_websites(file_url_list, file_output):
 
         # das Heruntergeladene wird von trafilatura brauchbar gemacht
         # Output-Format ist XML, Kommentare inkludiert, und nur Seiten auf Deutsch werden gescrapt
-        website_content_cleaned = extract(website_content, output_format='xml', include_comments=True,
+        website_content_cleaned = extract(website_content, with_metadata=True, output_format='xml', include_comments=True,
                                           target_language='de')
 
         if website_content_cleaned is None:  # falls die Extraktion nicht ausgeführt werden kann
--
GitLab

From 6d11bbd330106dbada84130da432dbd1abe142de Mon Sep 17 00:00:00 2001
From: Sonja Huber <sonja.huber2@uzh.ch>
Date: Thu, 30 Jan 2025 11:38:21 +0100
Subject: [PATCH 2/2] Converter Script now recognizes header tags automatically

---
 .../xml_converter.py | 46 +++++++++----------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/3._Tokenization_und_PoS_Tagging/xml_converter.py b/3._Tokenization_und_PoS_Tagging/xml_converter.py
index ced7687..5a1e5c6 100644
--- a/3._Tokenization_und_PoS_Tagging/xml_converter.py
+++ b/3._Tokenization_und_PoS_Tagging/xml_converter.py
@@ -27,35 +27,35 @@ def xml_converter(in_file, out_file):
     corpus_root = promethia_root.getroot()
 
     for index, doc_element in enumerate(trafi_root.iter("doc")):
-
-        # Create a 'text' element and add it to the 'corpus'
+        # Create 'text' element
         text = ET.SubElement(corpus_root, "text")
-        # Create a 'header' element and add it to 'text'
+
+        # Create 'header' element
         header = ET.SubElement(text, "header")
-
-        # Create 'id' and 'title' elements and add them to 'header'
-        id_element = ET.SubElement(header, "id")
-        id_element.text = str(index)
 
-        # Create 'date' element and add it to 'header'
-        date_element = ET.SubElement(header, "date")
-        if doc_element.get("date"):
-            date_element.text = doc_element.get("date")
+        # Get all attributes present in the current document
+        doc_attribs = doc_element.attrib.keys()
 
-        title_element = ET.SubElement(header, "title")
-        if doc_element.get("title"):
-            title_element.text = doc_element.get("title")
+        # Loop through all attributes dynamically
+        for field in doc_attribs:
+            element = ET.SubElement(header, field)
+            element.text = doc_element.get(field)
 
-        doc_content = doc_element.find("main")
-        # Create a 'body' element and add it to 'text'
-        body = ET.SubElement(text, "body")
-        body.text = ET.tostring(doc_content, method="text", encoding='Unicode')  # Replace with your text content
+        # Manually add 'id' since it's not in the document attributes
+        id_element = ET.SubElement(header, "id")
+        id_element.text = str(index)
 
-        # Save the XML to a file
-        output_file_path = out_file
-        promethia_root.write(
-            output_file_path, pretty_print=True, xml_declaration=True, encoding="utf-8"
-        )
+        # Create 'body' element
+        body = ET.SubElement(text, "body")
+        doc_content = doc_element.find("main")
+        if doc_content is not None:
+            body.text = ET.tostring(doc_content, method="text", encoding='Unicode')
+
+    # Save the XML to a file
+    output_file_path = out_file
+    promethia_root.write(
+        output_file_path, pretty_print=True, xml_declaration=True, encoding="utf-8"
+    )
 
     print(f"XML data saved to {output_file_path}")
     print("Done")
--
GitLab
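
For illustration only (not part of either patch): a minimal, self-contained sketch of the dynamic header logic introduced in PATCH 2/2. The sample <doc> attributes (title, date, url) are assumptions for demonstration; the actual attribute set depends on what trafilatura writes into its XML output once extract() is called with with_metadata=True (PATCH 1/2).

# sketch_dynamic_header.py (hypothetical file name, not in the repository)
from lxml import etree as ET

# A hypothetical <doc> element, shaped like trafilatura's XML output with metadata enabled.
sample_doc = ET.fromstring(
    '<doc title="Beispielseite" date="2025-01-30" url="https://example.org">'
    '<main>Hallo Welt</main>'
    '</doc>'
)

header = ET.Element("header")

# Every attribute on <doc> becomes its own header element, so newly added
# metadata fields are picked up without touching the converter again.
for field in sample_doc.attrib.keys():
    element = ET.SubElement(header, field)
    element.text = sample_doc.get(field)

print(ET.tostring(header, pretty_print=True, encoding="unicode"))
# Prints (element order follows attribute order):
# <header>
#   <title>Beispielseite</title>
#   <date>2025-01-30</date>
#   <url>https://example.org</url>
# </header>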