{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8ccf0641-0617-4ee1-b016-8c1bae67cc92",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup as bs\n",
    "import json\n",
    "import jsonlines\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "49f64c37-5ef9-4717-aa2e-ee95037f4fa4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████| 44309/44309 [00:47<00:00, 926.29it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "44309\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "file = open('med.cd.az.xml', 'r', encoding = 'utf-8')\n",
    "content = file.readlines()\n",
    "bs_contents = []\n",
    "for line in tqdm(content):\n",
    "    bs_contents.append(bs(line, 'xml'))\n",
    "print(len(bs_contents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96b1f564-9a91-4249-831d-ee7ba1220a2e",
   "metadata": {},
   "outputs": [],
   "source": [
    "for bs_content in tqdm(bs_contents):\n",
    "    word_dict = {}\n",
    "    for entry in bs_content.find_all('HOMOGRAPH'):\n",
    "        # tip1: view what tags each entry may contain before proceeding.\n",
    "        # tip2: not all entries have all the attributes. BeautifulSoup returns `None` when it finds nothing.\n",
    "        # Note that some tags might not look exactly like those in the output example \n",
    "            # (e.g. SENSE45, TRANSLATION). \n",
    "            # So please remember to change the tag names as required when you save them to your own dict.\n",
    "        ...\n",
    "        if entry.find('PART-OF-SPEECH') != None:\n",
    "            ...\n",
    "        if entry.find('SENSE45') != None:\n",
    "            sense_list = []\n",
    "            for sense in entry.find_all('SENSE45'):\n",
    "                sense_dict = {}\n",
    "                # DEFINITION\n",
    "                if sense.find('DEFINITION') != None:\n",
    "                    sense_dict['DEFINITION'] = sense.find('DEFINITION').get_text()\n",
    "                    ...\n",
    "                ...\n",
    "                \n",
    "        if entry.find('SENSE') != None:\n",
    "            ...\n",
    "        if entry.find('PHRASE') != None:\n",
    "            ..."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0823768e-a9e4-4dc3-a135-9745a1a6f29f",
   "metadata": {},
   "source": [
    "### Output example\n",
    "`my_dict['record1']`\n",
    "```json\n",
    "{\n",
    "    'ID': '284721',\n",
    "    'PART-OF-SPEECH': 'noun',\n",
    "    'PHRASE': [\n",
    "               {'DEFINITION': '...',\n",
    "                'EXAMPLES': ...,\n",
    "                'MULTIWORD': '...',\n",
    "                'TRANSLATION': '...'},\n",
    "                ...\n",
    "               ],\n",
    "            \n",
    "     'SENSE': [\n",
    "               {'DEFINITION': '...',\n",
    "                'EXAMPLES': ...,\n",
    "                'SUB-SENSE': [{'DEFINITION': '...',\n",
    "                               'EXAMPLES': ...,\n",
    "                               'TRANSLATION': '...'}],\n",
    "                'TRANSLATION': '...'},\n",
    "                ...\n",
    "               ]\n",
    "}\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "166d75bd-dacf-4efb-b90f-cb5bc9a37586",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}