From 12a53dc1796035ab9588998e8349cf6fe1a1845f Mon Sep 17 00:00:00 2001
From: Sonja Huber <sonja.huber2@uzh.ch>
Date: Thu, 30 Jan 2025 10:52:42 +0100
Subject: [PATCH 1/2] neue Metadaten option auf True gesetzt

---
 2._Web-Crawling/scrape_websites.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/2._Web-Crawling/scrape_websites.py b/2._Web-Crawling/scrape_websites.py
index b3230d1..a75bf86 100644
--- a/2._Web-Crawling/scrape_websites.py
+++ b/2._Web-Crawling/scrape_websites.py
@@ -46,7 +46,7 @@ def scrape_websites(file_url_list, file_output):
 
         # das Heruntergeladene wird von trafilatura brauchbar gemacht
         # Output-Format ist XML, Kommentare inkludiert, und nur Seiten auf Deutsch werden gescrapt
-        website_content_cleaned = extract(website_content, output_format='xml', include_comments=True,
+        website_content_cleaned = extract(website_content, with_metadata=True, output_format='xml', include_comments=True,
                                           target_language='de')
 
         if website_content_cleaned is None:  # falls die Extraktion nicht ausgeführt werden kann
--
GitLab

From 6d11bbd330106dbada84130da432dbd1abe142de Mon Sep 17 00:00:00 2001
From: Sonja Huber <sonja.huber2@uzh.ch>
Date: Thu, 30 Jan 2025 11:38:21 +0100
Subject: [PATCH 2/2] Converter Script now recognizes header tags automatically

---
 .../xml_converter.py | 46 +++++++++----------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/3._Tokenization_und_PoS_Tagging/xml_converter.py b/3._Tokenization_und_PoS_Tagging/xml_converter.py
index ced7687..5a1e5c6 100644
--- a/3._Tokenization_und_PoS_Tagging/xml_converter.py
+++ b/3._Tokenization_und_PoS_Tagging/xml_converter.py
@@ -27,35 +27,35 @@ def xml_converter(in_file, out_file):
     corpus_root = promethia_root.getroot()
 
     for index, doc_element in enumerate(trafi_root.iter("doc")):
-
-        # Create a 'text' element and add it to the 'corpus'
+        # Create 'text' element
         text = ET.SubElement(corpus_root, "text")
-        # Create a 'header' element and add it to 'text'
+
+        # Create 'header' element
         header = ET.SubElement(text, "header")
-
-        # Create 'id' and 'title' elements and add them to 'header'
-        id_element = ET.SubElement(header, "id")
-        id_element.text = str(index)
 
-        # Create 'date' element and add it to 'header'
-        date_element = ET.SubElement(header, "date")
-        if doc_element.get("date"):
-            date_element.text = doc_element.get("date")
+        # Get all attributes present in the current document
+        doc_attribs = doc_element.attrib.keys()
 
-        title_element = ET.SubElement(header, "title")
-        if doc_element.get("title"):
-            title_element.text = doc_element.get("title")
+        # Loop through all attributes dynamically
+        for field in doc_attribs:
+            element = ET.SubElement(header, field)
+            element.text = doc_element.get(field)
 
-        doc_content = doc_element.find("main")
-        # Create a 'body' element and add it to 'text'
-        body = ET.SubElement(text, "body")
-        body.text = ET.tostring(doc_content, method="text", encoding='Unicode')  # Replace with your text content
+        # Manually add 'id' since it's not in the document attributes
+        id_element = ET.SubElement(header, "id")
+        id_element.text = str(index)
 
-        # Save the XML to a file
-        output_file_path = out_file
-        promethia_root.write(
-            output_file_path, pretty_print=True, xml_declaration=True, encoding="utf-8"
-        )
+        # Create 'body' element
+        body = ET.SubElement(text, "body")
+        doc_content = doc_element.find("main")
+        if doc_content is not None:
+            body.text = ET.tostring(doc_content, method="text", encoding='Unicode')
+
+    # Save the XML to a file
+    output_file_path = out_file
+    promethia_root.write(
+        output_file_path, pretty_print=True, xml_declaration=True, encoding="utf-8"
+    )
 
     print(f"XML data saved to {output_file_path}")
     print("Done")
--
GitLab
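
For illustration only (not part of either patch): a minimal, self-contained sketch of the dynamic header logic introduced in PATCH 2/2. The sample <doc> attributes (title, date, url) are assumptions for demonstration; the actual attribute set depends on what trafilatura writes into its XML output once extract() is called with with_metadata=True (PATCH 1/2).

# sketch_dynamic_header.py (hypothetical file name, not in the repository)
from lxml import etree as ET

# A hypothetical <doc> element, shaped like trafilatura's XML output with metadata enabled.
sample_doc = ET.fromstring(
    '<doc title="Beispielseite" date="2025-01-30" url="https://example.org">'
    '<main>Hallo Welt</main>'
    '</doc>'
)

header = ET.Element("header")

# Every attribute on <doc> becomes its own header element, so newly added
# metadata fields are picked up without touching the converter again.
for field in sample_doc.attrib.keys():
    element = ET.SubElement(header, field)
    element.text = sample_doc.get(field)

print(ET.tostring(header, pretty_print=True, encoding="unicode"))
# Prints (element order follows attribute order):
# <header>
#   <title>Beispielseite</title>
#   <date>2025-01-30</date>
#   <url>https://example.org</url>
# </header>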