{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8ccf0641-0617-4ee1-b016-8c1bae67cc92",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup as bs\n",
    "import json\n",
    "import jsonlines\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "49f64c37-5ef9-4717-aa2e-ee95037f4fa4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████| 44309/44309 [00:47<00:00, 926.29it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "44309\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Read the dictionary dump line by line; the context manager closes the\n",
    "# file handle as soon as reading is done.\n",
    "with open('med.cd.az.xml', 'r', encoding='utf-8') as file:\n",
    "    content = file.readlines()\n",
    "\n",
    "# Each line is parsed as its own XML document.\n",
    "bs_contents = []\n",
    "for line in tqdm(content):\n",
    "    bs_contents.append(bs(line, 'xml'))\n",
    "print(len(bs_contents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96b1f564-9a91-4249-831d-ee7ba1220a2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Template: build one dictionary per parsed record from its HOMOGRAPH entries.\n",
    "# Every `...` marks a part that still has to be filled in.\n",
    "# NOTE(review): `word_dict` is rebuilt for every record but is not stored\n",
    "# anywhere in this template; the output example below reads from `my_dict`.\n",
    "for bs_content in tqdm(bs_contents):\n",
    "    word_dict = {}\n",
    "    for entry in bs_content.find_all('HOMOGRAPH'):\n",
    "        # tip1: view what tags each entry may contain before proceeding.\n",
    "        # tip2: not all entries have all the attributes. BeautifulSoup returns `None` when it finds nothing.\n",
    "        # Note that some tags might not look exactly like those in the output example\n",
    "        # (e.g. SENSE45, TRANSLATION).\n",
    "        # So please remember to change the tag names as required when you save them to your own dict.\n",
    "        ...\n",
    "        if entry.find('PART-OF-SPEECH') is not None:\n",
    "            ...\n",
    "        if entry.find('SENSE45') is not None:\n",
    "            sense_list = []\n",
    "            for sense in entry.find_all('SENSE45'):\n",
    "                sense_dict = {}\n",
    "                # DEFINITION (look the tag up once and reuse the result)\n",
    "                definition = sense.find('DEFINITION')\n",
    "                if definition is not None:\n",
    "                    sense_dict['DEFINITION'] = definition.get_text()\n",
    "                ...\n",
    "            ...\n",
    "\n",
    "        if entry.find('SENSE') is not None:\n",
    "            ...\n",
    "        if entry.find('PHRASE') is not None:\n",
    "            ..."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0823768e-a9e4-4dc3-a135-9745a1a6f29f",
   "metadata": {},
   "source": [
    "### Output example\n",
    "`my_dict['record1']`\n",
    "```python\n",
    "{\n",
    "    'ID': '284721',\n",
    "    'PART-OF-SPEECH': 'noun',\n",
    "    'PHRASE': [\n",
    "        {'DEFINITION': '...',\n",
    "         'EXAMPLES': ...,\n",
    "         'MULTIWORD': '...',\n",
    "         'TRANSLATION': '...'},\n",
    "        ...\n",
    "    ],\n",
    "\n",
    "    'SENSE': [\n",
    "        {'DEFINITION': '...',\n",
    "         'EXAMPLES': ...,\n",
    "         'SUB-SENSE': [{'DEFINITION': '...',\n",
    "                        'EXAMPLES': ...,\n",
    "                        'TRANSLATION': '...'}],\n",
    "         'TRANSLATION': '...'},\n",
    "        ...\n",
    "    ]\n",
    "}\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "166d75bd-dacf-4efb-b90f-cb5bc9a37586",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}