| import numpy as np |
| from tqdm import tqdm |
| import pandas as pd |
| import os, sqlite3, traceback, ast, requests, fasttext, re, time, string, spacy, pysbd |
| from requests.exceptions import ReadTimeout, TooManyRedirects, ConnectionError, ConnectTimeout, InvalidSchema, InvalidURL |
| from qwikidata.linked_data_interface import get_entity_dict_from_api |
| from datetime import datetime |
| import utils.wikidata_utils as wdutils |
| from importlib import reload |
| from urllib.parse import urlparse, unquote |
| from urllib import parse |
| from bs4 import BeautifulSoup |
| from IPython.display import clear_output |
| from os.path import exists |
| from pathlib import Path |
| from nltk.tokenize import sent_tokenize |
| from sentence_splitter import SentenceSplitter, split_text_into_sentences |
| import nltk |
| nltk.download('punkt') |
|
|
| class DatabaseExtractor(): |
| def __init__(self, dbname='wikidata_claims_refs_parsed.db'): |
| self.dbname = dbname |
| self.prepare_extraction() |
| |
| def finish_extraction(self): |
| self.db.commit() |
| |
| def prepare_extraction(self): |
| self.db = sqlite3.connect(self.dbname) |
| self.cursor = self.db.cursor() |
|
|
| self.cursor.execute(''' |
| CREATE TABLE IF NOT EXISTS claims( |
| entity_id TEXT, |
| claim_id TEXT, |
| rank TEXT, |
| property_id TEXT, |
| datatype TEXT, |
| datavalue TEXT, |
| PRIMARY KEY ( |
| claim_id |
| ) |
| )''') |
|
|
| self.cursor.execute(''' |
| CREATE TABLE IF NOT EXISTS claims_refs( |
| claim_id TEXT, |
| reference_id TEXT, |
| PRIMARY KEY ( |
| claim_id, |
| reference_id |
| ) |
| )''') |
|
|
| self.cursor.execute(''' |
| CREATE TABLE IF NOT EXISTS refs( |
| reference_id TEXT, |
| reference_property_id TEXT, |
| reference_index TEXT, |
| reference_datatype TEXT, |
| reference_value TEXT, |
| PRIMARY KEY ( |
| reference_id, |
| reference_property_id, |
| reference_index |
| ) |
| )''') |
| self.db.commit() |
| |
| def extract_claim(self, entity_id, claim): |
| if claim['mainsnak']['snaktype'] == 'value': |
| value = str(claim['mainsnak']['datavalue']) |
| else: |
| value = claim['mainsnak']['snaktype'] |
| try: |
| self.cursor.execute(''' |
| INSERT INTO claims(entity_id, claim_id, rank, property_id, datatype, datavalue) |
| VALUES($var,$var,$var,$var,$var,$var)'''.replace('$var','?'), ( |
| entity_id,claim['id'],claim['rank'], |
| claim['mainsnak']['property'],claim['mainsnak']['datatype'],value |
| )) |
| except UnicodeEncodeError: |
| print(entity_id,claim['id'],claim['rank'], |
| claim['mainsnak']['property'],claim['mainsnak']['datatype'],value) |
| raise |
| except sqlite3.IntegrityError as err: |
| |
| self.cursor.execute( |
| '''SELECT * |
| FROM claims |
| WHERE claim_id=$var |
| '''.replace('$var','?'), (claim['id'],) |
| ) |
| conflicted_value = self.cursor.fetchone() |
| if conflicted_value == (entity_id,claim['id'],claim['rank'], |
| claim['mainsnak']['property'],claim['mainsnak']['datatype'],value): |
| pass |
| else: |
| print(err, claim['id']) |
| traceback.print_exc() |
| raise err |
| finally: |
| |
| pass |
|
|
| def extract_reference(self, ref): |
| for snaks in ref['snaks'].values(): |
| for i, snak in enumerate(snaks): |
| if snak['snaktype'] == 'value': |
| value = str(snak['datavalue']) |
| else: |
| value = snak['snaktype'] |
| try: |
| self.cursor.execute(''' |
| INSERT INTO refs(reference_id, reference_property_id, reference_index, |
| reference_datatype, reference_value) |
| VALUES($var,$var,$var,$var,$var)'''.replace('$var','?'), ( |
| ref['hash'],snak['property'],str(i),snak['datatype'],value |
| )) |
| except sqlite3.IntegrityError as err: |
| |
| self.cursor.execute( |
| '''SELECT reference_id, reference_property_id, reference_datatype, reference_value |
| FROM refs |
| WHERE reference_id = $var |
| AND reference_property_id = $var |
| '''.replace('$var','?'), (ref['hash'],snak['property']) |
| ) |
| conflicted_values = self.cursor.fetchall() |
| if (ref['hash'],snak['property'],snak['datatype'],value) in conflicted_values: |
| pass |
| else: |
| print(err, ref['hash'],snak['property'],i) |
| print('trying to insert:',(ref['hash'],snak['property'],str(i),snak['datatype'],value)) |
| traceback.print_exc() |
| raise err |
| finally: |
| |
| pass |
| |
| def extract_claim_reference(self, claim, ref): |
| claim['id'],ref['hash'] |
| try: |
| self.cursor.execute(''' |
| INSERT INTO claims_refs(claim_id, reference_id) |
| VALUES($var,$var)'''.replace('$var','?'), ( |
| claim['id'],ref['hash'] |
| )) |
| except sqlite3.IntegrityError as err: |
| |
| pass |
| finally: |
| |
| pass |
| |
| def extract_entity(self, e): |
| for outgoing_property_id in e['claims'].values(): |
| for claim in outgoing_property_id: |
| self.extract_claim(e['id'],claim) |
| if 'references' in claim: |
| for ref in claim['references']: |
| self.extract_claim_reference(claim, ref) |
| self.extract_reference(ref) |
|
|
| def claimParser(QID): |
| entity_id = QID |
| print('Setting up database ...') |
| extractor = DatabaseExtractor() |
|
|
| print('Fetching entity from API ...') |
| entity = get_entity_dict_from_api(entity_id) |
|
|
| if entity: |
| print(f'Parsing entity: {entity_id}') |
| extractor.extract_entity(entity) |
| else: |
| print(f'Failed to fetch entity: {entity_id}') |
|
|
| extractor.finish_extraction() |
|
|
| def propertyFiltering(QID): |
| reload(wdutils) |
| DB_PATH = 'wikidata_claims_refs_parsed.db' |
| claims_columns = ['entity_id','claim_id','rank','property_id','datatype','datavalue'] |
|
|
| properties_to_remove = { |
| 'general':[ |
| 'P31', |
| 'P279', |
| 'P373', |
| 'P910', |
| 'P7561', |
| 'P5008', |
| 'P2670', |
| 'P1740', |
| 'P1612', |
| 'P8989', |
| 'P2959', |
| 'P7867', |
| 'P935' , |
| 'P1472', |
| 'P8596', |
| 'P5105', |
| 'P8933', |
| 'P642', |
| 'P3876', |
| 'P1791', |
| 'P7084', |
| 'P1465', |
| 'P1687', |
| 'P6104', |
| 'P4195', |
| 'P1792', |
| 'P5869', |
| 'P1659', |
| 'P1464', |
| 'P2354', |
| 'P1424', |
| 'P7782', |
| 'P179', |
| 'P7888', |
| 'P6365', |
| 'P8464', |
| 'P360', |
| 'P805', |
| 'P8703', |
| 'P1456', |
| 'P1012', |
| 'P1151', |
| 'P2490', |
| 'P593', |
| 'P8744', |
| 'P2614', |
| 'P2184', |
| 'P9241', |
| 'P487', |
| 'P1754', |
| 'P2559', |
| 'P2517', |
| 'P971', |
| 'P6112', |
| 'P4224', |
| 'P301', |
| 'P1753', |
| 'P1423', |
| 'P1204', |
| 'P3921', |
| 'P1963', |
| 'P5125', |
| 'P3176', |
| 'P8952', |
| 'P2306', |
| 'P5193', |
| 'P5977', |
| ], |
| 'specific': {} |
| } |
|
|
| db = sqlite3.connect(DB_PATH) |
| cursor = db.cursor() |
| |
| sql_query = "select count(*) from claims where property_id in $1;" |
| sql_query = sql_query.replace('$1', '(' + ','.join([('"' + e + '"') for e in properties_to_remove['general']]) + ')') |
| cursor.execute(sql_query) |
| print('Removing the',len(properties_to_remove['general']),'properties deemed as ontological or unverbalisable') |
| cursor = db.cursor() |
|
|
| sql_query = "select * from claims where entity_id in $1;" |
| sql_query = sql_query.replace('$1', '(' + ','.join([('"' + e + '"') for e in [QID]]) + ')') |
|
|
| cursor.execute(sql_query) |
| theme_df = pd.DataFrame(cursor.fetchall()) |
| theme_df.columns = claims_columns |
|
|
| original_theme_df_size = theme_df.shape[0] |
| last_stage_theme_df_size = original_theme_df_size |
|
|
| print('- Removing deprecated') |
|
|
| |
| theme_df = theme_df[theme_df['rank'] != 'deprecated'].reset_index(drop=True) |
| print( |
| ' - Percentage of deprecated:', |
| round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%' |
| ) |
| last_stage_theme_df_size = theme_df.shape[0] |
|
|
| print('- Removing bad datatypes') |
|
|
| |
| bad_datatypes = ['commonsMedia','external-id','globe-coordinate','url', 'wikibase-form', |
| 'geo-shape', 'math', 'musical-notation', 'tabular-data', 'wikibase-sense'] |
| theme_df = theme_df[ |
| theme_df['datatype'].apply( |
| lambda x : x not in bad_datatypes |
| ) |
| ].reset_index(drop=True) |
| print( |
| ' - Percentage of bad datatypes:', |
| round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%' |
| ) |
| last_stage_theme_df_size = theme_df.shape[0] |
|
|
| print('- Removing bad properties') |
|
|
| |
| theme_df = theme_df[ |
| theme_df['property_id'].apply( |
| lambda x : (x not in properties_to_remove['general'])) |
| |
| ].reset_index(drop=True) |
| print( |
| ' - Percentage of ontology (non-domain) properties:', |
| round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%' |
| ) |
| last_stage_theme_df_size = theme_df.shape[0] |
|
|
| print('- Removing somevalue/novalue') |
|
|
| |
| theme_df = theme_df[ |
| theme_df['datavalue'].apply( |
| lambda x : x not in ['somevalue', 'novalue'] |
| ) |
| ].reset_index(drop=True) |
| print( |
| ' - Percentage of somevalue/novalue:', |
| round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%' |
| ) |
| last_stage_theme_df_size = theme_df.shape[0] |
|
|
| print( |
| 'After all removals, we keep', |
| round(last_stage_theme_df_size/original_theme_df_size*100, 2), |
| ) |
| theme_df.to_sql('claims', db, if_exists='replace', index=False) |
|
|
| return theme_df |
|
|
| def get_object_label_given_datatype(row): |
| Wd_API = wdutils.CachedWikidataAPI() |
| Wd_API.languages = ['en'] |
| def turn_to_century_or_millennium(y, mode): |
| y = str(y) |
| if mode == 'C': |
| div = 100 |
| group = int(y.rjust(3, '0')[:-2]) |
| mode_name = 'century' |
| elif mode == 'M': |
| div = 1000 |
| group = int(y.rjust(4, '0')[:-3]) |
| mode_name = 'millenium' |
| else: |
| raise ValueError('Use mode = C for century and M for millennium') |
| |
| if int(y)%div != 0: |
| group += 1 |
| group = str(group) |
|
|
| group_suffix = ( |
| 'st' if group[-1] == '1' else ( |
| 'nd' if group[-1] == '2' else ( |
| 'rd' if group[-1] == '3' else 'th' |
| ) |
| ) |
| ) |
|
|
| return ' '.join([group+group_suffix, mode_name]) |
|
|
| dt = row['datatype'] |
| dv = row['datavalue'] |
| |
| dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string'] |
| if dt not in dt_types: |
| print(dt) |
| raise ValueError |
| else: |
| try: |
| if dt == dt_types[0]: |
| return Wd_API.get_label(ast.literal_eval(dv)['value']['id'], True) |
| elif dt == dt_types[1]: |
| dv = ast.literal_eval(dv) |
| return (dv['value']['text'], dv['value']['language']) |
| elif dt == dt_types[2]: |
| dv = ast.literal_eval(dv) |
| amount, unit = dv['value']['amount'], dv['value']['unit'] |
| if amount[0] == '+': |
| amount = amount[1:] |
| if str(unit) == '1': |
| return (str(amount), 'en') |
| else: |
| unit_entity_id = unit.split('/')[-1] |
| unit = Wd_API.get_label(unit_entity_id, True) |
| return (' '.join([amount, unit[0]]), unit[1]) |
| elif dt == dt_types[3]: |
| dv = ast.literal_eval(dv) |
| time = dv['value']['time'] |
| timezone = dv['value']['timezone'] |
| precision = dv['value']['precision'] |
| assert dv['value']['after'] == 0 and dv['value']['before'] == 0 |
|
|
| sufix = 'BC' if time[0] == '-' else '' |
| time = time[1:] |
|
|
| if precision == 11: |
| return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%d/%m/%Y') + sufix, 'en') |
| elif precision == 10: |
| try: |
| return (datetime.strptime(time, '%Y-%m-00T00:00:%SZ').strftime("%B of %Y") + sufix, 'en') |
| except ValueError: |
| return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime("%B of %Y") + sufix, 'en') |
| elif precision == 9: |
| try: |
| return (datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y') + sufix, 'en') |
| except ValueError: |
| return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%Y') + sufix, 'en') |
| elif precision == 8: |
| try: |
| return (datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y')[:-1] +'0s' + sufix, 'en') |
| except ValueError: |
| return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%Y')[:-1] +'0s' + sufix, 'en') |
| elif precision == 7: |
| try: |
| parsed_time = datetime.strptime(time, '%Y-00-00T00:00:%SZ') |
| except ValueError: |
| parsed_time = datetime.strptime(time, '%Y-%m-%dT00:00:%SZ') |
| finally: |
| return (turn_to_century_or_millennium( |
| parsed_time.strftime('%Y'), mode='C' |
| ) + sufix, 'en') |
| elif precision == 6: |
| try: |
| parsed_time = datetime.strptime(time, '%Y-00-00T00:00:%SZ') |
| except ValueError: |
| parsed_time = datetime.strptime(time, '%Y-%m-%dT00:00:%SZ') |
| finally: |
| return (turn_to_century_or_millennium( |
| parsed_time.strftime('%Y'), mode='M' |
| ) + sufix, 'en') |
| elif precision == 4: |
| timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y')) |
| timeint = round(timeint/1e5,1) |
| return (str(timeint) + 'hundred thousand years' + sufix, 'en') |
| elif precision == 3: |
| timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y')) |
| timeint = round(timeint/1e6,1) |
| return (str(timeint) + 'million years' + sufix, 'en') |
| elif precision == 0: |
| timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y')) |
| timeint = round(timeint/1e9,1) |
| return (str(timeint) + 'billion years' +sufix, 'en') |
| elif dt == dt_types[4]: |
| return (ast.literal_eval(dv)['value'], 'en') |
| except ValueError as e: |
| |
| raise e |
| |
| def get_object_desc_given_datatype(row): |
| Wd_API = wdutils.CachedWikidataAPI() |
| Wd_API.languages = ['en'] |
| dt = row['datatype'] |
| dv = row['datavalue'] |
| |
| dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string'] |
| if dt not in dt_types: |
| print(dt) |
| raise ValueError |
| else: |
| try: |
| if dt == dt_types[0]: |
| return Wd_API.get_desc(ast.literal_eval(dv)['value']['id']) |
| elif dt == dt_types[1]: |
| return ('no-desc', 'none') |
| elif dt == dt_types[2]: |
| dv = ast.literal_eval(dv) |
| amount, unit = dv['value']['amount'], dv['value']['unit'] |
| if amount[0] == '+': |
| amount = amount[1:] |
| if str(unit) == '1': |
| return ('no-desc', 'none') |
| else: |
| unit_entity_id = unit.split('/')[-1] |
| return Wd_API.get_desc(unit_entity_id) |
| elif dt == dt_types[3]: |
| return ('no-desc', 'none') |
| elif dt == dt_types[4]: |
| return ('no-desc', 'none') |
| except ValueError as e: |
| |
| raise e |
| |
| def get_object_alias_given_datatype(row): |
| Wd_API = wdutils.CachedWikidataAPI() |
| Wd_API.languages = ['en'] |
| dt = row['datatype'] |
| dv = row['datavalue'] |
| |
| dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string'] |
| if dt not in dt_types: |
| print(dt) |
| raise ValueError |
| else: |
| try: |
| if dt == dt_types[0]: |
| return Wd_API.get_alias(ast.literal_eval(dv)['value']['id']) |
| elif dt == dt_types[1]: |
| return ('no-alias', 'none') |
| elif dt == dt_types[2]: |
| dv = ast.literal_eval(dv) |
| amount, unit = dv['value']['amount'], dv['value']['unit'] |
| if amount[0] == '+': |
| amount = amount[1:] |
| if str(unit) == '1': |
| return ('no-alias', 'none') |
| else: |
| unit_entity_id = unit.split('/')[-1] |
| return Wd_API.get_alias(unit_entity_id) |
| elif dt == dt_types[3]: |
| dv = ast.literal_eval(dv) |
| time = dv['value']['time'] |
| timezone = dv['value']['timezone'] |
| precision = dv['value']['precision'] |
| assert dv['value']['after'] == 0 and dv['value']['before'] == 0 |
|
|
| sufix = 'BC' if time[0] == '-' else '' |
| time = time[1:] |
|
|
| if precision == 11: |
| return ([ |
| datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%-d of %B, %Y') + sufix, |
| datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%d/%m/%Y (dd/mm/yyyy)') + sufix, |
| datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%b %-d, %Y') + sufix |
| ], 'en') |
| else: |
| return ('no-alias', 'none') |
| elif dt == dt_types[4]: |
| return ('no-alias', 'none') |
| except ValueError as e: |
| |
| raise e |
|
|
| def textualAugmentation(filtered_df): |
|
|
| Wd_API = wdutils.CachedWikidataAPI() |
| Wd_API.languages = ['en'] |
|
|
| filtered_df['entity_label'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_label(x, True)) |
| filtered_df['entity_desc'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_desc(x)) |
| filtered_df['entity_alias'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_alias(x)) |
|
|
| print(' - Predicate augmentation...') |
| filtered_df['property_label'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_label(x, True)) |
| filtered_df['property_desc'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_desc(x)) |
| filtered_df['property_alias'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_alias(x)) |
|
|
| print(' - Object augmentation...') |
| filtered_df['object_label'] = filtered_df.apply(get_object_label_given_datatype, axis=1) |
| filtered_df['object_desc'] = filtered_df.apply(get_object_desc_given_datatype, axis=1) |
| filtered_df['object_alias'] = filtered_df.apply(get_object_alias_given_datatype, axis=1) |
|
|
|
|
| no_subject_label_perc = filtered_df[filtered_df['entity_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100 |
| print(' - No subject label %:', no_subject_label_perc, '%') |
|
|
| no_predicate_label_perc = filtered_df[filtered_df['property_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100 |
| print(' - No predicate label %:', no_predicate_label_perc, '%') |
|
|
| no_object_label_perc = filtered_df[filtered_df['object_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100 |
| print(' - No object label %:', no_object_label_perc, '%') |
| return filtered_df |
|
|
| def urlParser(target_QID): |
| Wd_API = wdutils.CachedWikidataAPI() |
| Wd_API.languages = ['en'] |
| db = sqlite3.connect('wikidata_claims_refs_parsed.db') |
| cursor = db.cursor() |
| refs_columns = ['reference_id','reference_property_id', 'reference_index', 'reference_datatype', 'reference_value'] |
| cursor.execute('select * from refs where reference_datatype="url";') |
| url_df = pd.DataFrame(cursor.fetchall()) |
| url_df.columns = refs_columns |
| def reference_value_to_url(reference_value): |
| if reference_value in ['novalue','somevalue']: |
| return reference_value |
| reference_value = ast.literal_eval(reference_value) |
| assert reference_value['type'] == 'string' |
| return reference_value['value'] |
| def reference_value_to_external_id(reference_value): |
| if reference_value in ['novalue','somevalue']: |
| return reference_value |
| reference_value = ast.literal_eval(reference_value) |
| assert reference_value['type'] == 'string' |
| return reference_value['value'] |
| def get_formatter_url(entity_id): |
| try: |
| sparql_query = ''' |
| SELECT ?item ?itemLabel |
| WHERE |
| { |
| wd:$1 wdt:P1630 ?item. |
| SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } |
| } |
| '''.replace('$1',entity_id) |
| sparql_results = Wd_API.query_sparql_endpoint(sparql_query) |
| if len(sparql_results['results']['bindings']) > 0: |
| return sparql_results['results']['bindings'][0]['item']['value'] |
| else: |
| return 'no_formatter_url' |
| except Exception: |
| print(entity_id) |
| print(sparql_results) |
| raise |
| url_df['url'] = url_df.reference_value.apply(reference_value_to_url) |
| cursor.execute('select * from refs where reference_datatype="url";') |
| ext_id_df = pd.DataFrame(cursor.fetchall()) |
| ext_id_df.columns = refs_columns |
| ext_id_df['ext_id'] = ext_id_df.reference_value.apply(reference_value_to_external_id) |
| ext_id_df['formatter_url'] = ext_id_df['reference_property_id'].apply(get_formatter_url) |
| ext_id_df['url'] = ext_id_df.apply(lambda x : x['formatter_url'].replace('$1', x['ext_id']), axis=1) |
| columns_for_join = ['reference_id', 'reference_property_id','reference_index','reference_datatype','url'] |
| url_df_pre_join = url_df[columns_for_join] |
| ext_id_df_pre_join = ext_id_df[columns_for_join] |
| all_url_df = pd.concat([url_df_pre_join,ext_id_df_pre_join]) |
| all_url_df = all_url_df.sort_values(['reference_id','reference_index']) |
| |
| all_url_df = all_url_df[all_url_df['url'] != 'no_formatter_url'].reset_index(drop=True) |
| |
| all_url_df = all_url_df[~all_url_df['url'].isin(['somevalue','novalue'])] |
| reference_id_counts = all_url_df.reference_id.value_counts().reset_index() |
| reference_id_counts.columns = ['reference_id', 'counts'] |
| reference_id_counts_equal_1 = reference_id_counts[reference_id_counts['counts'] == 1].reference_id.tolist() |
| all_url_df_eq1 = all_url_df[all_url_df.reference_id.isin(reference_id_counts_equal_1)] |
| all_url_df_eq1 = all_url_df_eq1.reset_index(drop=True).drop('reference_index', axis=1) |
| return all_url_df_eq1 |
|
|
| def htmlParser(url_set, qid): |
| text_reference_sampled_df = url_set |
| _RE_COMBINE_WHITESPACE = re.compile(r"\s+") |
| text_reference_sampled_df['html'] = None |
| for i, row in text_reference_sampled_df.iterrows(): |
|
|
| print(i, row.url) |
| try: |
| response = requests.get(row.url, timeout=10) |
| if response.status_code == 200: |
| html = response.text |
| text_reference_sampled_df.loc[i, 'html'] = html |
| else: |
| print(f"not response, {response.status_code}") |
| text_reference_sampled_df.loc[i, 'html'] = response.status_code |
| except requests.exceptions.Timeout: |
| print("Timeout occurred while fetching the URL:", row.url) |
| text_reference_sampled_df.loc[i, 'html'] = 'TimeOut' |
| pass |
| except Exception as e: |
| print("An error occurred:", str(e)) |
| pass |
| text_reference_sampled_df_html = text_reference_sampled_df.copy() |
| text_reference_sampled_df_html['entity_id'] = qid |
| return text_reference_sampled_df_html |
|
|
| def claim2text(html_set): |
| text_reference_sampled_df_html = html_set |
| Wd_API = wdutils.CachedWikidataAPI() |
| Wd_API.languages = ['en'] |
| db = sqlite3.connect('wikidata_claims_refs_parsed.db') |
| cursor = db.cursor() |
| claims_columns = ['entity_id','claim_id','rank','property_id','datatype','datavalue'] |
| refs_columns = ['reference_id', 'reference_property_id', 'reference_index', 'reference_datatype', 'reference_value'] |
|
|
| def reference_id_to_claim_id(reference_id): |
| cursor.execute(f'select claim_id from claims_refs where reference_id="{reference_id}"') |
| sql_result = cursor.fetchall() |
| |
| randomly_chosen_claim_id = np.array(sql_result).reshape(-1) |
| return randomly_chosen_claim_id |
| |
| def reference_id_to_claim_data(reference_id): |
| claim_ids = reference_id_to_claim_id(reference_id) |
| r = [] |
| for claim_id in claim_ids: |
| |
| cursor.execute(f'select * from claims where claim_id="{claim_id}";') |
| d = cursor.fetchall() |
| r = r + d |
| return r |
|
|
| claim_data = [] |
| for reference_id in text_reference_sampled_df_html.reference_id: |
| data = reference_id_to_claim_data(reference_id) |
| |
| data = [(reference_id,) + t for t in data] |
| claim_data = claim_data + data |
| |
|
|
| claim_df = pd.DataFrame(claim_data, columns = ['reference_id'] + claims_columns) |
| claim_df |
|
|
| def claim_id_to_claim_url(claim_id): |
| claim_id_parts = claim_id.split('$') |
| return f'https://www.wikidata.org/wiki/{claim_id_parts[0]}#{claim_id}' |
|
|
| BAD_DATATYPES = ['external-id','commonsMedia','url', 'globe-coordinate', 'wikibase-lexeme', 'wikibase-property'] |
|
|
| assert claim_df[~claim_df.datatype.isin(BAD_DATATYPES)].reference_id.unique().shape\ |
| == claim_df.reference_id.unique().shape |
|
|
| print(claim_df.reference_id.unique().shape[0]) |
| claim_df = claim_df[~claim_df.datatype.isin(BAD_DATATYPES)].reset_index(drop=True) |
|
|
| from tqdm.auto import tqdm |
| tqdm.pandas() |
|
|
| claim_df[['entity_label','entity_label_lan']] = pd.DataFrame( |
| claim_df.entity_id.progress_apply(Wd_API.get_label, non_language_set=True).tolist() |
| ) |
| claim_df[['property_label','property_label_lan']] = pd.DataFrame( |
| claim_df.property_id.progress_apply(Wd_API.get_label, non_language_set=True).tolist() |
| ) |
|
|
| claim_df[['entity_alias','entity_alias_lan']] = pd.DataFrame( |
| claim_df.entity_id.progress_apply(Wd_API.get_alias, non_language_set=True).tolist() |
| ) |
| claim_df[['property_alias','property_alias_lan']] = pd.DataFrame( |
| claim_df.property_id.progress_apply(Wd_API.get_alias, non_language_set=True).tolist() |
| ) |
|
|
| claim_df[['entity_desc','entity_desc_lan']] = pd.DataFrame( |
| claim_df.entity_id.progress_apply(Wd_API.get_desc, non_language_set=True).tolist() |
| ) |
| claim_df[['property_desc','property_desc_lan']] = pd.DataFrame( |
| claim_df.property_id.progress_apply(Wd_API.get_desc, non_language_set=True).tolist() |
| ) |
|
|
| claim_df['object_label'] = claim_df.apply(get_object_label_given_datatype, axis=1) |
| claim_df['object_alias'] = claim_df.apply(get_object_alias_given_datatype, axis=1) |
| claim_df['object_desc'] = claim_df.apply(get_object_desc_given_datatype, axis=1) |
|
|
| claim_df['object_label'], claim_df['object_label_lan'] = zip(*claim_df['object_label'].apply(lambda x: x if isinstance(x, tuple) else (x, ''))) |
| claim_df['object_alias'], claim_df['object_alias_lan'] = zip(*claim_df['object_alias'].apply(lambda x: x if isinstance(x, tuple) else (x, ''))) |
| claim_df['object_desc'], claim_df['object_desc_lan'] = zip(*claim_df['object_desc'].apply(lambda x: x if isinstance(x, tuple) else (x, ''))) |
|
|
| |
| claim_df = claim_df[claim_df['object_label_lan'] != 'none'].reset_index(drop=True) |
| return claim_df |
|
|
| def html2text(html_set): |
| reference_html_df = html_set |
| _RE_COMBINE_WHITESPACE = re.compile(r"\s+") |
| ft_model = fasttext.load_model('base/lid.176.ftz') |
| def predict_language(text, k=20): |
| ls, scores = ft_model.predict(text, k=k) |
| ls = [l.replace('__label__','') for l in ls] |
| return list(zip(ls,scores)) |
| def get_url_language(html): |
| try: |
| soup = BeautifulSoup(html, "lxml") |
| [s.decompose() for s in soup("script")] |
| if soup.body == None: |
| return ('no body', None) |
| body_text = _RE_COMBINE_WHITESPACE.sub(" ", soup.body.get_text(' ')).strip() |
| return predict_language(body_text, k=1)[0] |
| except Exception: |
| raise |
| def get_text_p_tags(soup): |
| p_tags = soup.find_all('p') |
| text = [p.getText().strip() for p in p_tags if p.getText()] |
| return '\n'.join(text) |
| def clean_text_line_by_line(text, join=True, ch_join = ' ', verb=True): |
| |
| |
| lines = list(text.splitlines()) |
| lines = (line.strip() for line in lines) |
| |
| lines = (re.sub(r' {2,}', ' ', line) for line in lines) |
| |
| lines = (re.sub(r' ([.,:;!?\\-])', r'\1', line) for line in lines) |
| |
| lines = [line+'.' if line and line[-1] not in string.punctuation else line for i, line in enumerate(lines)] |
| |
| if verb: |
| for i, line in enumerate(lines): |
| print(i,line) |
| |
| if join: |
| return ch_join.join([line for line in lines if line]) |
| else: |
| return [line for line in lines if line] |
|
|
| def apply_manual_rules(text): |
| |
| |
| |
| |
| |
| |
| |
| text = re.sub(r'\[[0-9]+\]', '', text) |
| return text |
| def retrieve_text_from_html(html, soup_parser = 'lxml', verb=True, join=True): |
| if not isinstance(html, str) or 'DOCTYPE html' not in html: |
| return 'No body' |
| soup = BeautifulSoup(html, soup_parser) |
| for script in soup(["script", "style"]): |
| script.decompose() |
| if soup.body == None: |
| return 'No body' |
| [s.unwrap() for s in soup.body.find_all('strong')] |
|
|
| for p in soup.body.find_all('p'): |
| p.string = _RE_COMBINE_WHITESPACE.sub(" ", p.get_text('')).strip() |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| text = soup.body.get_text(' ').strip() |
| |
| |
| text = apply_manual_rules(text) |
| text = clean_text_line_by_line(text, ch_join = ' ', verb=verb, join=join) |
|
|
| if not text: |
| return 'No extractable text' if join else ['No extractable text'] |
| else: |
| return text |
| i=0 |
| print(i) |
| print(reference_html_df.url.iloc[i]) |
|
|
| reference_html_df['extracted_sentences'] = reference_html_df.html.progress_apply(retrieve_text_from_html, join=False, verb=False) |
|
|
| join_ch = ' ' |
| reference_html_df['extracted_text'] = reference_html_df.extracted_sentences.apply(lambda x : join_ch.join(x)) |
|
|
| splitter = SentenceSplitter(language='en') |
|
|
| seg = pysbd.Segmenter(language="en", clean=False) |
| |
| if not spacy.util.is_package("en_core_web_lg"): |
| os.system("python -m spacy download en_core_web_lg") |
|
|
| nlp = spacy.load("en_core_web_lg") |
|
|
| text = reference_html_df.loc[0,'extracted_text'] |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| sents = [s for s in nlp(text).sents] |
|
|
|
|
| reference_html_df['nlp_sentences'] = reference_html_df.extracted_text.progress_apply(lambda x : [str(s) for s in nlp(x).sents]) |
| reference_html_df['nlp_sentences_slide_2'] = reference_html_df['nlp_sentences'].progress_apply( |
| lambda x : [' '.join([a,b]) for a,b in list(zip(x,x[1:]+['']))] |
| ) |
|
|
| assert type(reference_html_df.loc[0,'nlp_sentences']) == list |
| assert type(reference_html_df.loc[0,'nlp_sentences'][0]) == str |
| assert type(reference_html_df.loc[0,'nlp_sentences_slide_2']) == list |
| assert type(reference_html_df.loc[0,'nlp_sentences_slide_2'][0]) == str |
| return reference_html_df |
|
|
| if __name__ == '__main__': |
| conn = sqlite3.connect('wikidata_claims_refs_parsed.db') |
| target_QID = 'Q3621696' |
| claimParser(target_QID) |
| filtered_df = propertyFiltering(target_QID) |
| url_set = urlParser(target_QID) |
| html_set = htmlParser(url_set, target_QID) |
| try: |
| claim_text = claim2text(html_set) |
| html_text = html2text(html_set) |
| claim_text = claim_text.astype(str) |
| html_text = html_text.astype(str) |
| claim_text.to_sql('claim_text', conn, if_exists='replace', index=False) |
| html_text.to_sql('html_text', conn, if_exists='replace', index=False) |
| except Exception as e: |
| print(f"No accessible html documents") |
| |
|
|
| conn.commit() |
| conn.close() |
| |
| |