diff --git a/.github/workflows/prove-shared-ci.yml b/.github/workflows/prove-shared-ci.yml new file mode 100644 index 0000000..bd6565c --- /dev/null +++ b/.github/workflows/prove-shared-ci.yml @@ -0,0 +1,33 @@ +name: prove-shared-ci + +on: + push: + paths: + - "prove-shared/**" + - ".github/workflows/prove-shared-ci.yml" + pull_request: + paths: + - "prove-shared/**" + - ".github/workflows/prove-shared-ci.yml" + +jobs: + test-prove-shared: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install package and test dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -e ./prove-shared[test] + + - name: Run tests + run: | + python -m pytest prove-shared/tests -q diff --git a/.gitignore b/.gitignore index dbf1d51..fb3d5aa 100755 --- a/.gitignore +++ b/.gitignore @@ -167,6 +167,8 @@ entity_cache.p wikidata_claims_refs_parsed.db results.csv *.sh +!prove-api/scripts/*.sh +!prove-processing/scripts/*.sh API_key.txt *.out CodeArchive/ @@ -178,6 +180,7 @@ output.log2.gz tester.py api/secrets.py utils/secrets.py +**/local_secrets.py db/ tmp/ .claude/settings.local.json diff --git a/README.new.md b/README.new.md new file mode 100644 index 0000000..fc164fd --- /dev/null +++ b/README.new.md @@ -0,0 +1,185 @@ +# ProVe (Provenance Verification for Wikidata claims) + + +## Overview + +ProVe is a system designed to automatically verify claims and references in Wikidata. It extracts claims from Wikidata entities, fetches the referenced URLs, processes the HTML content, and uses NLP models to determine whether the claims are supported by the referenced content. + +It: +1. extracts claims and references from a Wikidata item, +2. fetches reference content from external URLs, +3. selects evidence sentences, +4. runs textual entailment, +5. stores and serves results through API and background services. 
+ +## Current Repository Structure + +The codebase is now organized into three top-level folders inside this workspace: + +- prove-api: HTTP/API layer, dashboard, templates, docs, queue endpoint +- prove-processing: background workers, pipeline orchestration, ML/NLP models +- prove-shared: pip-installable shared package (MongoDB models/handlers, auth, utilities) + +Root-level files still include global project metadata such as pyproject.toml, README.md, LICENSE, and project planning docs. + +## Architecture Summary + +### 1) Data Collection and Processing + +- WikidataParser extracts claims and reference URLs from QIDs. +- HTMLFetcher downloads referenced pages (requests/selenium fallback). +- HTMLSentenceProcessor turns HTML into candidate evidence sentences. + +### 2) Evidence Selection and Verification + +- EvidenceSelector ranks candidate evidence against claims. +- ClaimEntailmentChecker classifies SUPPORTS / REFUTES / NOT ENOUGH INFO. + +### 3) NLP Models + +- TextualEntailmentModule (BERT-FEVER style entailment) +- SentenceRetrievalModule (sentence relevance scoring) +- VerbModule (graph statement verbalization) + +### 4) Storage + +- MongoDB: html content, entailment outputs, parser stats, queue/status +- SQLite: historical/aggregated data used by API logic in legacy paths + +## Shared Package (prove-shared) + +The shared package is installable and used by API and processing code. + +### Local install + +From root: + +```bash +uv sync +# or +pip install . +``` + +Root pyproject.toml includes a local path dependency to install prove_shared from prove-shared. + +### Direct shared install + +```bash +cd prove-shared +pip install -e . +``` + +### Import style + +```python +from prove_shared import MongoDBHandler, AsyncAuth, Status +from prove_shared.mongo_handler import requestItemProcessing +``` + +## Setup Instructions + +## 1) Python environment + +Use Python 3.10.16 as declared in project metadata. 
+ +## 2) Install dependencies + +Install from root: + +```bash +pip install . +``` + +## 3) Download model assets + +The base model assets are still required for processing pipelines. + +Download: + +https://emckclac-my.sharepoint.com/:u:/g/personal/stty3154_kcl_ac_uk/IQDeSEYuxxRDSp-zJovVXvbRAVmhmXRw97g7D0eLmJIKyUs?e=Iq446V + +Place the base folder at the expected location used by model paths in processing modules. + +## 4) Runtime secrets + +Environment-specific secrets files are required and should remain gitignored. + +Key examples: + +- prove-shared/src/prove_shared/local_secrets.py +- prove-api/api/local_secrets.py (if used by API modules) +- prove-processing/utils/local_secrets.py (legacy paths still referenced by some processing code) + +## 5) Configuration + +Shared runtime settings are in: + +- prove-shared/config.yaml + +Includes DB paths, batch sizes, thresholds, and algorithm version. + +## How to Run + +## Processing a single entity + +```python +from ProVe_main_process import initialize_models, process_entity + +models = initialize_models() +qid = "Q44" +html_df, entailment_results, parser_stats = process_entity(qid, models) +``` + +## Start processing service + +```bash +cd prove-processing +python ProVe_main_service.py +``` + +## Start API service + +```bash +cd prove-api +python app.py +``` + +## Background processing + +The scheduler can process: + +- top viewed Wikidata items, +- pagepile list items, +- heuristic/random QID queues. + +## Data Flow + +1. API or scheduler enqueues a QID. +2. Processing worker fetches queue task. +3. Parser extracts claims + reference URLs. +4. HTML collector fetches and stores page content metadata. +5. Evidence selector ranks candidate sentences. +6. Entailment model classifies claim-evidence relationship. +7. Results are written to MongoDB and served by API. 
+ +## Notes on Ongoing Split + +This repository currently contains all three components in one workspace folder, but structure and imports are being aligned for independent repository operation: + +- prove-api +- prove-processing +- prove-shared + +Project planning details are documented in project.md. + +## Legacy Information Preserved from Previous README + +The original README emphasized: + +- parser/fetcher/evidence/entailment pipeline, +- MongoDB + SQLite storage model, +- service entry points, +- configuration in config.yaml, +- required external model folder. + +All of these remain applicable, now mapped to the split folder layout above. diff --git a/api/db/website.py b/api/db/website.py deleted file mode 100644 index 2201a16..0000000 --- a/api/db/website.py +++ /dev/null @@ -1,22 +0,0 @@ -# @repo: api -# @description: SQLAlchemy model for newsletter subscriber persistence (API web layer) -from flask_sqlalchemy import SQLAlchemy - -db = SQLAlchemy() - -class NewsletterSubscriber(db.Model): - id = db.Column(db.Integer, primary_key=True) - name = db.Column(db.String(120), nullable=False) - email = db.Column(db.String(120), unique=True, nullable=False) - -class User(db.Model): - id = db.Column(db.Integer, primary_key=True) - username = db.Column(db.String(80), unique=True, nullable=False) - password = db.Column(db.String(120), nullable=False) - submission = db.relationship('Submission', backref='user') - -class Submission(db.Model): - filename = db.Column(db.String(120), nullable=False, primary_key=True) - user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False) - upload_time = db.Column(db.DateTime, nullable=False) - diff --git a/api/index.html b/prove-api/api_index.html similarity index 100% rename from api/index.html rename to prove-api/api_index.html diff --git a/api/app.py b/prove-api/app.py similarity index 97% rename from api/app.py rename to prove-api/app.py index a871a1c..4b1e659 100644 --- a/api/app.py +++ b/prove-api/app.py @@ -5,14 
+5,16 @@ from flask import Flask, jsonify, request, render_template_string from flask_cors import CORS from flasgger import Swagger, swag_from -import sys import json -from custom_decorators import log_request, api_required, AsyncAuth -from local_secrets import CODE_PATH -from queue_manager import QueueManager +try: + from custom_decorators import log_request, api_required + from queue_manager import QueueManager +except ImportError: + from api.custom_decorators import log_request, api_required + from api.queue_manager import QueueManager -sys.path.append(CODE_PATH) +from prove_shared.auth import AsyncAuth import functions diff --git a/prove-api/config.yaml b/prove-api/config.yaml new file mode 100644 index 0000000..0027578 --- /dev/null +++ b/prove-api/config.yaml @@ -0,0 +1,40 @@ +# TODO: This is a duplicate of the root config.yaml for the prove-api package. +# Once the split is finalised, each package should only contain the settings it needs. + +database: + name: 'wikidata_claims_refs_parsed.db' + result_db_for_API: '/home/ubuntu/mntdisk/reference_checked.db' + +queue: + heuristic: 'random' + +version: + algo_version: '1.1.1' + +parsing: + reset_database: True # This is a developer mode to clean-up DB to test something + +spacy: + model: 'en_core_web_sm' + +html_fetching: + batch_size: 10 + delay: 1.0 + fetching_driver: 'chrome' # available options: 'chrome' or 'requests' + timeout: 15 + +logging: + level: 'INFO' + format: '%(asctime)s - %(levelname)s - %(message)s' + +text_processing: + sentence_slide: + enabled: true + window_size: 2 # sliding window for masking sentences + join_char: ' ' + +evidence_selection: + batch_size: 256 + n_top_sentences: 5 + score_threshold: 0 + token_size: 512 diff --git a/api/custom_decorators.py b/prove-api/custom_decorators.py similarity index 94% rename from api/custom_decorators.py rename to prove-api/custom_decorators.py index 318bea8..35071f1 100644 --- a/api/custom_decorators.py +++ b/prove-api/custom_decorators.py @@ -4,24 
+4,21 @@ from base64 import b64encode, b64decode from functools import wraps from flask import request -import os import threading import time from typing import Any, Union -import sys from pymongo import MongoClient try: from utils_api import get_ip_location, logger - from local_secrets import CODE_PATH, SOURCE, API_KEY, PRIVATE_KEY + from local_secrets import SOURCE, API_KEY, PRIVATE_KEY except ImportError: from api.utils_api import get_ip_location, logger - from api.local_secrets import CODE_PATH, SOURCE, API_KEY, PRIVATE_KEY + from api.local_secrets import SOURCE, API_KEY, PRIVATE_KEY -sys.path.append(CODE_PATH) -from utils.mongo_handler import MongoDBHandler -from utils.auth import AsyncAuth +from prove_shared.mongo_handler import MongoDBHandler +from prove_shared.auth import AsyncAuth class StatsDBHandler(MongoDBHandler): diff --git a/dashboard.py b/prove-api/dashboard.py similarity index 90% rename from dashboard.py rename to prove-api/dashboard.py index 87919ba..2f3bf99 100644 --- a/dashboard.py +++ b/prove-api/dashboard.py @@ -197,9 +197,17 @@ def create_geo_choropleth(df_geo): return fig -def create_top_countries_chart(df_geo, top_n=10): +def _is_valid_location_label(value): + """Return True when a location label is present and meaningful.""" + if pd.isna(value): + return False + value_str = str(value).strip() + return value_str not in {'', 'Not found', 'None', 'null', 'nan'} + + +def create_top_countries_chart(df_geo, top_n=20): """Create top countries bar chart""" - df_top = df_geo.head(top_n) + df_top = df_geo[df_geo['country_code'].apply(_is_valid_location_label)].head(top_n) fig = go.Figure() @@ -216,18 +224,18 @@ def create_top_countries_chart(df_geo, top_n=10): xaxis_title='Number of Requests', yaxis_title='Country Code', template='plotly_white', - height=400, + height=650, yaxis={'categoryorder': 'total ascending'} ) return fig -def create_top_cities_chart(data, top_n=10): +def create_top_cities_chart(data, top_n=20): """Create top cities bar 
chart""" cities = data.get('city', {}) city_data = [{'city': city, 'count': count} for city, count in cities.items() - if city != 'Not found' and city != ''] + if _is_valid_location_label(city)] df_cities = pd.DataFrame(city_data).sort_values('count', ascending=False).head(top_n) fig = go.Figure() @@ -245,7 +253,7 @@ def create_top_cities_chart(data, top_n=10): xaxis_title='Number of Requests', yaxis_title='City', template='plotly_white', - height=400, + height=650, yaxis={'categoryorder': 'total ascending'} ) @@ -276,19 +284,41 @@ def create_execution_time_boxplot(df_requests): def create_request_distribution_pie(df_requests): - """Create request distribution pie chart""" - fig = go.Figure() + """Create readable request distribution pie chart with conditional labels.""" + df_dist = df_requests[['endpoint', 'count']].copy() + df_dist = df_dist.sort_values('count', ascending=False) + + total_count = df_dist['count'].sum() + df_dist['raw_percentage'] = (df_dist['count'] / total_count * 100) + df_dist = df_dist[df_dist['raw_percentage'] >= 0.05].copy() + df_dist['percentage'] = df_dist['raw_percentage'].round(1) + df_dist = df_dist.drop(columns=['raw_percentage']) + text_labels = [ + f"{pct:.1f}%" if pct >= 1.0 else '' + for pct in df_dist['percentage'] + ] + fig = go.Figure() fig.add_trace(go.Pie( - labels=df_requests['endpoint'], - values=df_requests['count'], - hovertemplate='%{label}
Count: %{value:,}
Percentage: %{percent}' + labels=df_dist['endpoint'], + values=df_dist['count'], + customdata=df_dist['percentage'], + text=text_labels, + texttemplate='%{text}', + textposition='outside', + textfont=dict(size=14), + hovertemplate='%{label}
Count: %{value:,}
Percentage: %{customdata:.1f}%', + sort=False, + marker=dict(line=dict(color='white', width=2)) )) fig.update_layout( title='Request Distribution by Endpoint', template='plotly_white', - height=400 + height=560, + uniformtext_minsize=12, + uniformtext_mode='hide', + legend=dict(title='Endpoint') ) return fig @@ -389,10 +419,10 @@ def build_overview_tab(data, kpis, df_monthly, df_requests): dbc.Row([ dbc.Col([ dcc.Graph(figure=create_request_performance_chart(df_requests)) - ], md=7), + ], md=6), dbc.Col([ dcc.Graph(figure=create_request_distribution_pie(df_requests)) - ], md=5), + ], md=6), ], className='mb-4'), ], fluid=True) ]) @@ -402,21 +432,18 @@ def build_geography_tab(df_geo, data): """Build Geographic Analysis tab content""" return dbc.Tab(label='Geographic Analysis', tab_id='geography', children=[ dbc.Container([ - # Choropleth Map + # Top Countries dbc.Row([ dbc.Col([ - dcc.Graph(figure=create_geo_choropleth(df_geo)) + dcc.Graph(figure=create_top_countries_chart(df_geo, top_n=20)) ], md=12) ], className='mb-4'), - # Top Countries and Cities + # Top Cities dbc.Row([ dbc.Col([ - dcc.Graph(figure=create_top_countries_chart(df_geo, top_n=10)) - ], md=6), - dbc.Col([ - dcc.Graph(figure=create_top_cities_chart(data, top_n=10)) - ], md=6), + dcc.Graph(figure=create_top_cities_chart(data, top_n=20)) + ], md=12), ], className='mb-4'), ], fluid=True) ]) @@ -531,12 +558,15 @@ def create_dashboard(): return app +app = create_dashboard() +server = app.server + + # ============================================================================ # MAIN EXECUTION # ============================================================================ if __name__ == '__main__': - app = create_dashboard() print("\n" + "="*60) print("Dashboard is running!") print("Open your browser and navigate to: http://127.0.0.1:8050") diff --git a/api/docs/api/config.yml b/prove-api/docs/api/config.yml similarity index 100% rename from api/docs/api/config.yml rename to prove-api/docs/api/config.yml 
diff --git a/api/docs/api/items/checkItemStatus.yml b/prove-api/docs/api/items/checkItemStatus.yml similarity index 100% rename from api/docs/api/items/checkItemStatus.yml rename to prove-api/docs/api/items/checkItemStatus.yml diff --git a/api/docs/api/items/comprehensiveResults.yml b/prove-api/docs/api/items/comprehensiveResults.yml similarity index 100% rename from api/docs/api/items/comprehensiveResults.yml rename to prove-api/docs/api/items/comprehensiveResults.yml diff --git a/api/docs/api/items/getSimpleResult.yml b/prove-api/docs/api/items/getSimpleResult.yml similarity index 100% rename from api/docs/api/items/getSimpleResult.yml rename to prove-api/docs/api/items/getSimpleResult.yml diff --git a/api/docs/api/items/history.yml b/prove-api/docs/api/items/history.yml similarity index 100% rename from api/docs/api/items/history.yml rename to prove-api/docs/api/items/history.yml diff --git a/api/docs/api/items/summary.yml b/prove-api/docs/api/items/summary.yml similarity index 100% rename from api/docs/api/items/summary.yml rename to prove-api/docs/api/items/summary.yml diff --git a/api/docs/api/requests/requestItem.yml b/prove-api/docs/api/requests/requestItem.yml similarity index 100% rename from api/docs/api/requests/requestItem.yml rename to prove-api/docs/api/requests/requestItem.yml diff --git a/api/docs/api/task/checkCompleted.yml b/prove-api/docs/api/task/checkCompleted.yml similarity index 100% rename from api/docs/api/task/checkCompleted.yml rename to prove-api/docs/api/task/checkCompleted.yml diff --git a/api/docs/api/task/checkErrors.yml b/prove-api/docs/api/task/checkErrors.yml similarity index 100% rename from api/docs/api/task/checkErrors.yml rename to prove-api/docs/api/task/checkErrors.yml diff --git a/api/docs/api/task/checkQueue.yml b/prove-api/docs/api/task/checkQueue.yml similarity index 100% rename from api/docs/api/task/checkQueue.yml rename to prove-api/docs/api/task/checkQueue.yml diff --git a/api/docs/api/worklist/generationBasics.yml 
b/prove-api/docs/api/worklist/generationBasics.yml similarity index 100% rename from api/docs/api/worklist/generationBasics.yml rename to prove-api/docs/api/worklist/generationBasics.yml diff --git a/api/docs/page/plot.yml b/prove-api/docs/page/plot.yml similarity index 100% rename from api/docs/page/plot.yml rename to prove-api/docs/page/plot.yml diff --git a/api/docs/page/worklist/generationBasics.yml b/prove-api/docs/page/worklist/generationBasics.yml similarity index 100% rename from api/docs/page/worklist/generationBasics.yml rename to prove-api/docs/page/worklist/generationBasics.yml diff --git a/api/docs/page/worklist/pagePileList.yml b/prove-api/docs/page/worklist/pagePileList.yml similarity index 100% rename from api/docs/page/worklist/pagePileList.yml rename to prove-api/docs/page/worklist/pagePileList.yml diff --git a/api/docs/process_reference.yml b/prove-api/docs/process_reference.yml similarity index 100% rename from api/docs/process_reference.yml rename to prove-api/docs/process_reference.yml diff --git a/front/common.js b/prove-api/front/common.js similarity index 100% rename from front/common.js rename to prove-api/front/common.js diff --git a/functions.py b/prove-api/functions.py similarity index 98% rename from functions.py rename to prove-api/functions.py index fd158df..d61cd4c 100644 --- a/functions.py +++ b/prove-api/functions.py @@ -17,10 +17,13 @@ from pymongo import collection import yaml -from utils.logger import logger -from utils.mongo_handler import MongoDBHandler -from utils.mongo_handler import requestItemProcessing as request_processing -from utils.objects import Status, HtmlContent, Entailment +import logging + +from prove_shared.mongo_handler import MongoDBHandler +from prove_shared.mongo_handler import requestItemProcessing as request_processing +from prove_shared.objects import Status, HtmlContent, Entailment + +logger = logging.getLogger("prove_api") mongo_handler = MongoDBHandler() @@ -837,6 +840,9 @@ def get_config_as_json(): 
def process_reference(url: str, claim: str): + # TODO: This function imports from prove-processing (inference side). + # Once the split is complete, this endpoint should call prove-processing via HTTP API + # instead of direct Python imports. import nltk import requests import pandas as pd diff --git a/api/hackathon/api_code.py b/prove-api/hackathon/api_code.py similarity index 97% rename from api/hackathon/api_code.py rename to prove-api/hackathon/api_code.py index 8a1a4ba..afa6ac0 100644 --- a/api/hackathon/api_code.py +++ b/prove-api/hackathon/api_code.py @@ -8,7 +8,10 @@ from hashlib import sha256 from db.website import db, NewsletterSubscriber from db.website import User, Submission -from local_secrets import UPLOAD_FOLDER, ALLOWED_EXTENSIONS +try: + from local_secrets import UPLOAD_FOLDER, ALLOWED_EXTENSIONS +except ImportError: + from api.local_secrets import UPLOAD_FOLDER, ALLOWED_EXTENSIONS app.secret_key = sha256(os.urandom(16)).hexdigest() app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///data.db' diff --git a/index.html b/prove-api/index.html similarity index 100% rename from index.html rename to prove-api/index.html diff --git a/info.py b/prove-api/info.py similarity index 95% rename from info.py rename to prove-api/info.py index 811b69b..aa7d3c7 100644 --- a/info.py +++ b/prove-api/info.py @@ -1,18 +1,19 @@ # @repo: api # @description: Collects and aggregates API usage statistics from MongoDB for reporting and the dashboard from collections import defaultdict -from api.custom_decorators import StatsDBHandler -import sys from tqdm import tqdm import time import numpy as np -from api.local_secrets import CODE_PATH -from api.utils_api import get_ip_location +try: + from custom_decorators import StatsDBHandler + from utils_api import get_ip_location +except ImportError: + from api.custom_decorators import StatsDBHandler + from api.utils_api import get_ip_location -sys.path.append(CODE_PATH) from pymongo import MongoClient -from ProVe_main_service import 
MongoDBHandler +from prove_shared.mongo_handler import MongoDBHandler class TMPStatsDBHandler(MongoDBHandler): diff --git a/api/page/plot.yml b/prove-api/page/plot.yml similarity index 100% rename from api/page/plot.yml rename to prove-api/page/plot.yml diff --git a/api/page/worklist/generationBasics.yml b/prove-api/page/worklist/generationBasics.yml similarity index 100% rename from api/page/worklist/generationBasics.yml rename to prove-api/page/worklist/generationBasics.yml diff --git a/api/page/worklist/pagePileList.yml b/prove-api/page/worklist/pagePileList.yml similarity index 100% rename from api/page/worklist/pagePileList.yml rename to prove-api/page/worklist/pagePileList.yml diff --git a/prove-api/pyproject.toml b/prove-api/pyproject.toml new file mode 100644 index 0000000..b7af5e4 --- /dev/null +++ b/prove-api/pyproject.toml @@ -0,0 +1,28 @@ +[build-system] +requires = ["setuptools>=69", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "prove-api" +version = "1.0.0" +description = "ProVe User Service — Flask API, dashboard, and queue management" +requires-python = ">=3.10" + +dependencies = [ + "prove_shared", + "flask", + "flask-cors", + "flasgger", + "flask-sqlalchemy", + "dash[cloud]", + "dash-bootstrap-components", + "plotly", + "schedule", + "pandas", +] + +[tool.setuptools] +packages = [] + +[tool.uv.sources] +prove_shared = { path = "../prove-shared" } diff --git a/api/queue_manager.py b/prove-api/queue_manager.py similarity index 96% rename from api/queue_manager.py rename to prove-api/queue_manager.py index 093b804..9955713 100644 --- a/api/queue_manager.py +++ b/prove-api/queue_manager.py @@ -3,15 +3,17 @@ from collections import defaultdict from threading import BoundedSemaphore from typing import Any, Dict, Union -import sys from pymongo import collection -from local_secrets import CODE_PATH, MAX_CONNECTIONS -from utils_api import logger +try: + from local_secrets import MAX_CONNECTIONS + from utils_api import logger 
+except ImportError: + from api.local_secrets import MAX_CONNECTIONS + from api.utils_api import logger -sys.path.append(CODE_PATH) -from utils.mongo_handler import MongoDBHandler +from prove_shared.mongo_handler import MongoDBHandler class QueueManager: diff --git a/scripts/restart.sh b/prove-api/scripts/restart.sh old mode 100755 new mode 100644 similarity index 100% rename from scripts/restart.sh rename to prove-api/scripts/restart.sh diff --git a/api/static/style.css b/prove-api/static/style.css similarity index 100% rename from api/static/style.css rename to prove-api/static/style.css diff --git a/swagger.json b/prove-api/swagger.json similarity index 100% rename from swagger.json rename to prove-api/swagger.json diff --git a/api/templates/hackathon.html b/prove-api/templates/hackathon.html similarity index 100% rename from api/templates/hackathon.html rename to prove-api/templates/hackathon.html diff --git a/api/templates/prove.html b/prove-api/templates/prove.html similarity index 100% rename from api/templates/prove.html rename to prove-api/templates/prove.html diff --git a/test_functions.py b/prove-api/test_functions.py similarity index 100% rename from test_functions.py rename to prove-api/test_functions.py diff --git a/api/utils_api.py b/prove-api/utils_api.py similarity index 100% rename from api/utils_api.py rename to prove-api/utils_api.py diff --git a/api/wsgi.py b/prove-api/wsgi.py similarity index 100% rename from api/wsgi.py rename to prove-api/wsgi.py diff --git a/ProVe_heuristic_service.py b/prove-processing/ProVe_heuristic_service.py similarity index 98% rename from ProVe_heuristic_service.py rename to prove-processing/ProVe_heuristic_service.py index 70b8a1a..ff9ebff 100644 --- a/ProVe_heuristic_service.py +++ b/prove-processing/ProVe_heuristic_service.py @@ -1,12 +1,14 @@ # @repo: processing # @description: Alternative background worker that generates random QIDs via heuristics and enqueues them for processing import random +import logging 
import sys from typing import List from background_processing import process_system_qid from ProVe_main_service import ProVeService -from utils.logger import logger + +logger = logging.getLogger("prove_processing") class HeuristicBasedService(ProVeService): diff --git a/ProVe_main_process.py b/prove-processing/ProVe_main_process.py similarity index 100% rename from ProVe_main_process.py rename to prove-processing/ProVe_main_process.py diff --git a/ProVe_main_service.py b/prove-processing/ProVe_main_service.py old mode 100755 new mode 100644 similarity index 96% rename from ProVe_main_service.py rename to prove-processing/ProVe_main_service.py index efda140..0a868a0 --- a/ProVe_main_service.py +++ b/prove-processing/ProVe_main_service.py @@ -1,6 +1,7 @@ # @repo: processing # @description: Background worker service — consumes queue items from the API, runs the pipeline, and writes results to MongoDB from datetime import datetime +import logging import time from threading import Lock from typing import List, Dict, Any, Union @@ -20,10 +21,13 @@ process_pagepile_list, ) import ProVe_main_process -from utils.logger import logger -from utils.mongo_handler import MongoDBHandler -from utils.local_secrets import ENDPOINT, API_KEY -from utils.auth import AsyncAuth +from prove_shared.mongo_handler import MongoDBHandler +from prove_shared.local_secrets import ENDPOINT, API_KEY +from prove_shared.auth import AsyncAuth + +# TODO: Approach for logging to be decided — not using prove_shared logger for now, +# each service sets up its own minimal logging. +logger = logging.getLogger("prove_processing") try: @@ -227,6 +231,9 @@ def main_loop(self, status_dict: Dict[str, Any]) -> None: self.mongo_handler.save_status(status_dict) logger.info("Updated new status_dict into status") try: + # TODO: This imports from prove-api (user service side). + # Once the split is complete, this should call prove-api via HTTP API + # instead of direct Python imports. 
from functions import get_summary get_summary(qid, update=True) except Exception: diff --git a/background_processing.py b/prove-processing/background_processing.py similarity index 97% rename from background_processing.py rename to prove-processing/background_processing.py index 9cbea8e..be1a16e 100644 --- a/background_processing.py +++ b/prove-processing/background_processing.py @@ -1,14 +1,16 @@ # @repo: processing # @description: Scheduled task management — fetches top-viewed Wikipedia items and pagepile lists to enqueue QIDs for processing from datetime import datetime, timedelta +import logging import random import pandas as pd import requests import yaml -from utils.logger import logger -from utils.mongo_handler import MongoDBHandler, requestItemProcessing +from prove_shared.mongo_handler import MongoDBHandler, requestItemProcessing + +logger = logging.getLogger("prove_processing") # Load config diff --git a/claim_entailment.py b/prove-processing/claim_entailment.py similarity index 99% rename from claim_entailment.py rename to prove-processing/claim_entailment.py index a57edb5..21c92dc 100644 --- a/claim_entailment.py +++ b/prove-processing/claim_entailment.py @@ -9,7 +9,7 @@ from tqdm import tqdm from datetime import datetime -from utils.logger import logger +logger = logging.getLogger("prove_processing") class ClaimEntailmentChecker: def __init__(self, config_path: str = 'config.yaml', text_entailment=None): diff --git a/prove-processing/config.yaml b/prove-processing/config.yaml new file mode 100644 index 0000000..166e488 --- /dev/null +++ b/prove-processing/config.yaml @@ -0,0 +1,40 @@ +# TODO: This is a duplicate of the root config.yaml for the prove-processing package. +# Once the split is finalised, each package should only contain the settings it needs. 
+ +database: + name: 'wikidata_claims_refs_parsed.db' + result_db_for_API: '/home/ubuntu/mntdisk/reference_checked.db' + +queue: + heuristic: 'random' + +version: + algo_version: '1.1.1' + +parsing: + reset_database: True # This is a developer mode to clean-up DB to test something + +spacy: + model: 'en_core_web_sm' + +html_fetching: + batch_size: 10 + delay: 1.0 + fetching_driver: 'chrome' # available options: 'chrome' or 'requests' + timeout: 15 + +logging: + level: 'INFO' + format: '%(asctime)s - %(levelname)s - %(message)s' + +text_processing: + sentence_slide: + enabled: true + window_size: 2 # sliding window for masking sentences + join_char: ' ' + +evidence_selection: + batch_size: 256 + n_top_sentences: 5 + score_threshold: 0 + token_size: 512 diff --git a/properties_to_remove.json b/prove-processing/properties_to_remove.json similarity index 100% rename from properties_to_remove.json rename to prove-processing/properties_to_remove.json diff --git a/prove-processing/pyproject.toml b/prove-processing/pyproject.toml new file mode 100644 index 0000000..968e036 --- /dev/null +++ b/prove-processing/pyproject.toml @@ -0,0 +1,21 @@ +[project] +name = "prove-processing" +version = "1.0.0" +description = "ProVe Inference Service — NLP pipeline for claim verification" +requires-python = ">=3.10" + +dependencies = [ + "prove-shared", + "torch", + "transformers==4.46.3", + "pytorch_lightning==2.4.0", + "qwikidata", + "nltk", + "bs4", + "selenium", + "html2text", + "rouge_score", + "sacrebleu", + "sparqlwrapper", + "schedule", +] diff --git a/refs_html_collection.py b/prove-processing/refs_html_collection.py similarity index 99% rename from refs_html_collection.py rename to prove-processing/refs_html_collection.py index a920123..4921810 100644 --- a/refs_html_collection.py +++ b/prove-processing/refs_html_collection.py @@ -1,6 +1,7 @@ # @repo: processing # @description: Fetches HTML content from reference URLs (supports requests + Selenium), with batching and status 
tracking from typing import Dict, Any, List +import logging import yaml import requests import time @@ -9,7 +10,7 @@ from selenium.webdriver.chrome.options import Options import pandas as pd -from utils.logger import logger +logger = logging.getLogger("prove_processing") def load_config(config_path: str) -> Dict[str, Any]: diff --git a/refs_html_to_evidences.py b/prove-processing/refs_html_to_evidences.py similarity index 99% rename from refs_html_to_evidences.py rename to prove-processing/refs_html_to_evidences.py index dad95f2..dcb661c 100644 --- a/refs_html_to_evidences.py +++ b/prove-processing/refs_html_to_evidences.py @@ -1,6 +1,7 @@ # @repo: processing # @description: Converts fetched HTML into candidate evidence sentences and ranks them by relevance using SentenceRetrievalModule + VerbModule import pandas as pd +import logging import nltk import html2text import requests @@ -9,7 +10,7 @@ from utils.sentence_retrieval_module import SentenceRetrievalModule import numpy as np -from utils.logger import logger +logger = logging.getLogger("prove_processing") class HTMLSentenceProcessor: diff --git a/scripts/dr_backup.sh b/prove-processing/scripts/dr_backup.sh similarity index 100% rename from scripts/dr_backup.sh rename to prove-processing/scripts/dr_backup.sh diff --git a/scripts/historical_backup.sh b/prove-processing/scripts/historical_backup.sh similarity index 100% rename from scripts/historical_backup.sh rename to prove-processing/scripts/historical_backup.sh diff --git a/utils/bert_model.py b/prove-processing/utils/bert_model.py old mode 100755 new mode 100644 similarity index 99% rename from utils/bert_model.py rename to prove-processing/utils/bert_model.py index 784c36d..6adc9cd --- a/utils/bert_model.py +++ b/prove-processing/utils/bert_model.py @@ -34,7 +34,7 @@ from torch import nn from torch.nn import CrossEntropyLoss -from utils.file_utils import cached_path +from prove_shared.file_utils import cached_path logger = logging.getLogger(__name__) diff 
--git a/utils/callbacks.py b/prove-processing/utils/callbacks.py old mode 100755 new mode 100644 similarity index 100% rename from utils/callbacks.py rename to prove-processing/utils/callbacks.py diff --git a/utils/finetune.py b/prove-processing/utils/finetune.py old mode 100755 new mode 100644 similarity index 100% rename from utils/finetune.py rename to prove-processing/utils/finetune.py diff --git a/utils/graph2text.py b/prove-processing/utils/graph2text.py similarity index 100% rename from utils/graph2text.py rename to prove-processing/utils/graph2text.py diff --git a/utils/lightning_base.py b/prove-processing/utils/lightning_base.py old mode 100755 new mode 100644 similarity index 100% rename from utils/lightning_base.py rename to prove-processing/utils/lightning_base.py diff --git a/utils/pagepileList.txt b/prove-processing/utils/pagepileList.txt similarity index 100% rename from utils/pagepileList.txt rename to prove-processing/utils/pagepileList.txt diff --git a/utils/sentence_retrieval_model.py b/prove-processing/utils/sentence_retrieval_model.py old mode 100755 new mode 100644 similarity index 100% rename from utils/sentence_retrieval_model.py rename to prove-processing/utils/sentence_retrieval_model.py diff --git a/utils/sentence_retrieval_module.py b/prove-processing/utils/sentence_retrieval_module.py old mode 100755 new mode 100644 similarity index 97% rename from utils/sentence_retrieval_module.py rename to prove-processing/utils/sentence_retrieval_module.py index 97343d1..1fcf7a6 --- a/utils/sentence_retrieval_module.py +++ b/prove-processing/utils/sentence_retrieval_module.py @@ -1,6 +1,7 @@ # @repo: processing # @description: BERT-based sentence relevance scorer — ranks candidate evidence sentences against a claim; used by EvidenceSelector in refs_html_to_evidences.py import re +import logging from typing import List, Tuple import pathlib @@ -8,7 +9,8 @@ from transformers import BertTokenizer from utils.sentence_retrieval_model import 
sentence_retrieval_model -from utils.logger import logger + +logger = logging.getLogger("prove_processing") THIS_DIR = pathlib.Path(__file__).parent.absolute() ARGS = { diff --git a/utils/textual_entailment_module.py b/prove-processing/utils/textual_entailment_module.py old mode 100755 new mode 100644 similarity index 100% rename from utils/textual_entailment_module.py rename to prove-processing/utils/textual_entailment_module.py diff --git a/utils/utils_graph2text.py b/prove-processing/utils/utils_graph2text.py old mode 100755 new mode 100644 similarity index 100% rename from utils/utils_graph2text.py rename to prove-processing/utils/utils_graph2text.py diff --git a/utils/utils_verbalisation_module.py b/prove-processing/utils/utils_verbalisation_module.py old mode 100755 new mode 100644 similarity index 100% rename from utils/utils_verbalisation_module.py rename to prove-processing/utils/utils_verbalisation_module.py diff --git a/utils/verbalisation_module.py b/prove-processing/utils/verbalisation_module.py old mode 100755 new mode 100644 similarity index 100% rename from utils/verbalisation_module.py rename to prove-processing/utils/verbalisation_module.py diff --git a/wikidata_parser.py b/prove-processing/wikidata_parser.py similarity index 99% rename from wikidata_parser.py rename to prove-processing/wikidata_parser.py index de9621c..7aee069 100644 --- a/wikidata_parser.py +++ b/prove-processing/wikidata_parser.py @@ -2,6 +2,7 @@ # @description: Extracts claims, property/object labels, and reference URLs from a Wikidata entity (QID) import ast import json +import logging from typing import List, Dict, Any import nltk @@ -16,7 +17,7 @@ InvalidEntityId ) -from utils.logger import logger +logger = logging.getLogger("prove_processing") def get_entity_dict_from_api( diff --git a/prove-shared/README.md b/prove-shared/README.md new file mode 100644 index 0000000..fddb41f --- /dev/null +++ b/prove-shared/README.md @@ -0,0 +1,71 @@ +# prove-shared + +Shared Python 
package for ProVe. + +This package contains common utilities used by both API and processing services: +- MongoDB handlers and shared data objects +- Auth helpers for queue/API communication +- Logging setup +- Wikidata and file helper utilities + +## Install + +From the `prove-shared` folder: + +```bash +pip install . +``` + +For editable development install: + +```bash +pip install -e . +``` + +## Import examples + +```python +from prove_shared import MongoDBHandler, Status, HtmlContent, Entailment, AsyncAuth + +# direct module imports still work +from prove_shared.mongo_handler import requestItemProcessing +``` + +## Package layout + +```text +prove-shared/ + pyproject.toml + config.yaml + src/ + prove_shared/ + __init__.py + auth.py + file_utils.py + logger.py + mongo_handler.py + objects.py + queue_manager.py + wikidata_utils.py +``` + +## Runtime secret file + +Some modules import `prove_shared.local_secrets` at runtime. + +Create a file at: + +- `src/prove_shared/local_secrets.py` (for local development), or +- `prove_shared/local_secrets.py` in the installed environment + +This file is intentionally environment-specific and should stay gitignored. 
+ +## Build + +To build wheel/sdist: + +```bash +python -m build +``` + +(Install build first if needed: `pip install build`) diff --git a/config.yaml b/prove-shared/config.yaml similarity index 100% rename from config.yaml rename to prove-shared/config.yaml diff --git a/prove-shared/pyproject.toml b/prove-shared/pyproject.toml new file mode 100644 index 0000000..3b5b0e7 --- /dev/null +++ b/prove-shared/pyproject.toml @@ -0,0 +1,32 @@ +[build-system] +requires = ["setuptools>=69", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "prove_shared" +version = "0.1.0" +description = "Shared utilities for ProVe services" +requires-python = ">=3.10" +dependencies = [ + "pymongo>=4.6", + "pandas>=2.0", + "pyyaml>=6.0", + "cryptography>=42.0", + "requests>=2.31", + "boto3>=1.34", + "numpy>=1.24", + "qwikidata>=0.4.2", + "tqdm>=4.66", +] + +[project.optional-dependencies] +test = [ + "pytest>=8.0", +] + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] +include = ["prove_shared*"] diff --git a/prove-shared/src/prove_shared/__init__.py b/prove-shared/src/prove_shared/__init__.py new file mode 100644 index 0000000..849420e --- /dev/null +++ b/prove-shared/src/prove_shared/__init__.py @@ -0,0 +1,28 @@ +from .auth import AsyncAuth +from .mongo_handler import MongoDBHandler, requestItemProcessing +from .objects import Entailment, HtmlContent, Status +from .queue_manager import QueueManager +from .wikidata_utils import CachedWikidataAPI +from .secrets import API_KEY, ENDPOINT, LOG_FILENAME, LOG_PATH, PRIVATE_KEY + +try: + from . 
import local_secrets as local_secrets +except ModuleNotFoundError: + local_secrets = None + +__all__ = [ + "AsyncAuth", + "CachedWikidataAPI", + "Entailment", + "HtmlContent", + "MongoDBHandler", + "QueueManager", + "Status", + "requestItemProcessing", + "local_secrets", + "API_KEY", + "ENDPOINT", + "LOG_FILENAME", + "LOG_PATH", + "PRIVATE_KEY", +] diff --git a/utils/auth.py b/prove-shared/src/prove_shared/auth.py similarity index 98% rename from utils/auth.py rename to prove-shared/src/prove_shared/auth.py index 288c5f4..756272a 100644 --- a/utils/auth.py +++ b/prove-shared/src/prove_shared/auth.py @@ -10,7 +10,7 @@ from cryptography.hazmat.primitives.serialization import load_pem_private_key from cryptography.hazmat.primitives.serialization import load_pem_public_key -from .local_secrets import API_KEY, PRIVATE_KEY +from .secrets import API_KEY, PRIVATE_KEY class AsyncAuth: diff --git a/utils/file_utils.py b/prove-shared/src/prove_shared/file_utils.py old mode 100755 new mode 100644 similarity index 100% rename from utils/file_utils.py rename to prove-shared/src/prove_shared/file_utils.py diff --git a/utils/logger.py b/prove-shared/src/prove_shared/logger.py similarity index 89% rename from utils/logger.py rename to prove-shared/src/prove_shared/logger.py index a6c7da1..de68c54 100644 --- a/utils/logger.py +++ b/prove-shared/src/prove_shared/logger.py @@ -4,14 +4,14 @@ from logging.handlers import TimedRotatingFileHandler import os -from .local_secrets import LOG_FILENAME, LOG_PATH +from .secrets import LOG_FILENAME, LOG_PATH if not os.path.exists(LOG_PATH): try: os.makedirs(LOG_PATH, exist_ok=True) except PermissionError: - LOG_PATH = "./logs/backend/" + LOG_PATH = "./logs/prove_shared/" os.makedirs(LOG_PATH, exist_ok=True) logger = logging.getLogger("ProVe") diff --git a/utils/mongo_handler.py b/prove-shared/src/prove_shared/mongo_handler.py similarity index 99% rename from utils/mongo_handler.py rename to prove-shared/src/prove_shared/mongo_handler.py index 
b03a20c..405ccf0 100644 --- a/utils/mongo_handler.py +++ b/prove-shared/src/prove_shared/mongo_handler.py @@ -9,7 +9,7 @@ import pandas as pd from pymongo import MongoClient, collection, database, ReturnDocument -from utils.logger import logger +from .logger import logger class MongoDBHandler: diff --git a/utils/objects.py b/prove-shared/src/prove_shared/objects.py similarity index 100% rename from utils/objects.py rename to prove-shared/src/prove_shared/objects.py diff --git a/utils/queue_manager.py b/prove-shared/src/prove_shared/queue_manager.py similarity index 96% rename from utils/queue_manager.py rename to prove-shared/src/prove_shared/queue_manager.py index c6f90b0..146ccec 100644 --- a/utils/queue_manager.py +++ b/prove-shared/src/prove_shared/queue_manager.py @@ -4,8 +4,8 @@ import uuid import hashlib -from local_secrets import ENDPOINT, API_KEY -from auth import AsyncAuth +from .secrets import ENDPOINT, API_KEY +from .auth import AsyncAuth class QueueManager: def __init__(self, queue_name: str): diff --git a/prove-shared/src/prove_shared/secrets.py b/prove-shared/src/prove_shared/secrets.py new file mode 100644 index 0000000..0b25919 --- /dev/null +++ b/prove-shared/src/prove_shared/secrets.py @@ -0,0 +1,35 @@ +"""Runtime secrets loader with safe fallbacks. + +This module prefers values from ``prove_shared.local_secrets`` when available, +and falls back to environment variables or sensible local defaults otherwise. 
+""" + +from __future__ import annotations + +import os +from pathlib import Path + + +def _default_private_key_path() -> str: + return str(Path.home() / ".prove" / "keys" / "private_key") + + +try: + from .local_secrets import API_KEY as _API_KEY + from .local_secrets import ENDPOINT as _ENDPOINT + from .local_secrets import LOG_FILENAME as _LOG_FILENAME + from .local_secrets import LOG_PATH as _LOG_PATH + from .local_secrets import PRIVATE_KEY as _PRIVATE_KEY +except ModuleNotFoundError: + _API_KEY = os.getenv("PROVE_API_KEY", "") + _ENDPOINT = os.getenv("PROVE_ENDPOINT", "http://localhost/api/internal/") + _LOG_FILENAME = os.getenv("PROVE_LOG_FILENAME", "prove_shared.log") + _LOG_PATH = os.getenv("PROVE_LOG_PATH", "./logs/prove_shared/") + _PRIVATE_KEY = os.getenv("PROVE_PRIVATE_KEY", _default_private_key_path()) + + +API_KEY = _API_KEY +ENDPOINT = _ENDPOINT +LOG_FILENAME = _LOG_FILENAME +LOG_PATH = _LOG_PATH +PRIVATE_KEY = _PRIVATE_KEY diff --git a/utils/wikidata_utils.py b/prove-shared/src/prove_shared/wikidata_utils.py old mode 100755 new mode 100644 similarity index 100% rename from utils/wikidata_utils.py rename to prove-shared/src/prove_shared/wikidata_utils.py diff --git a/prove-shared/tests/conftest.py b/prove-shared/tests/conftest.py new file mode 100644 index 0000000..65f89d5 --- /dev/null +++ b/prove-shared/tests/conftest.py @@ -0,0 +1,21 @@ +from pathlib import Path +import sys +import types + + +PACKAGE_DIR = Path(__file__).resolve().parents[1] / "src" / "prove_shared" + +# Avoid executing prove_shared/__init__.py during tests; it imports runtime-heavy modules. +if "prove_shared" not in sys.modules: + pkg = types.ModuleType("prove_shared") + pkg.__path__ = [str(PACKAGE_DIR)] + sys.modules["prove_shared"] = pkg + +# Provide predictable test values for runtime secrets expected by shared modules. 
+secrets = types.ModuleType("prove_shared.local_secrets") +secrets.API_KEY = "test-api-key" +secrets.PRIVATE_KEY = str(PACKAGE_DIR / "_test_private_key.pem") +secrets.ENDPOINT = "http://localhost/" +secrets.LOG_FILENAME = "prove-shared-test.log" +secrets.LOG_PATH = str(PACKAGE_DIR / "_test_logs") + "/" +sys.modules["prove_shared.local_secrets"] = secrets diff --git a/prove-shared/tests/test_auth.py b/prove-shared/tests/test_auth.py new file mode 100644 index 0000000..f4d4f1b --- /dev/null +++ b/prove-shared/tests/test_auth.py @@ -0,0 +1,48 @@ +import importlib + + +def _auth_module(): + return importlib.import_module("prove_shared.auth") + + +def test_serialize_returns_base64_string(): + AsyncAuth = _auth_module().AsyncAuth + value = AsyncAuth.serialize(b"abc") + assert value == "YWJj" + + +def test_encrypt_then_decrypt_returns_original_message(monkeypatch): + auth_module = _auth_module() + AsyncAuth = auth_module.AsyncAuth + + rsa = auth_module.rsa + default_backend = auth_module.default_backend + + private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048, backend=default_backend()) + public_key = private_key.public_key() + + monkeypatch.setattr(AsyncAuth, "get_private_key", classmethod(lambda cls: private_key)) + + encrypted = AsyncAuth.encrypt(public_key, "hello") + decrypted = AsyncAuth.decrypt(encrypted) + + assert isinstance(encrypted, bytes) + assert decrypted == "hello" + + +def test_is_valid_returns_true_when_decrypted_key_matches(monkeypatch): + AsyncAuth = _auth_module().AsyncAuth + monkeypatch.setattr(AsyncAuth, "decrypt", classmethod(lambda cls, payload: "test-api-key")) + + assert AsyncAuth.is_valid(b"payload") is True + + +def test_is_valid_returns_false_on_decrypt_value_error(monkeypatch): + AsyncAuth = _auth_module().AsyncAuth + + def _raise(_cls, _payload): + raise ValueError("bad payload") + + monkeypatch.setattr(AsyncAuth, "decrypt", classmethod(_raise)) + + assert AsyncAuth.is_valid(b"payload") is False diff --git 
a/prove-shared/tests/test_mongo_handler.py b/prove-shared/tests/test_mongo_handler.py new file mode 100644 index 0000000..8a4a574 --- /dev/null +++ b/prove-shared/tests/test_mongo_handler.py @@ -0,0 +1,119 @@ +from datetime import datetime + +from bson import ObjectId + +from prove_shared.mongo_handler import MongoDBHandler, requestItemProcessing + + +class DummyQueue: + def __init__(self, find_one_value=None, find_one_and_update_value=None, should_raise=False): + self.find_one_value = find_one_value + self.find_one_and_update_value = find_one_and_update_value + self.should_raise = should_raise + self.last_find_one_query = None + self.last_find_one_and_update_args = None + + def find_one(self, query): + self.last_find_one_query = query + if self.should_raise: + raise RuntimeError("find_one failed") + return self.find_one_value + + def find_one_and_update(self, query, update, sort=None, return_document=None): + self.last_find_one_and_update_args = { + "query": query, + "update": update, + "sort": sort, + "return_document": return_document, + } + if self.should_raise: + raise RuntimeError("find_one_and_update failed") + return self.find_one_and_update_value + + +def test_request_item_processing_returns_skip_for_existing_qid(): + queue = DummyQueue(find_one_value={"qid": "Q42", "status": "in queue"}) + + msg = requestItemProcessing( + qid="Q42", + queue=queue, + save_function=lambda _doc: None, + ) + + assert "already in queue" in msg + + +def test_request_item_processing_returns_created_message_and_calls_save(): + queue = DummyQueue(find_one_value=None) + saved = {} + + def _save(doc): + saved.update(doc) + + msg = requestItemProcessing( + qid="Q7", + queue=queue, + request_type="userRequested", + algo_version="1.2.3", + save_function=_save, + ) + + assert msg.startswith("Task ") + assert " created for QID Q7" in msg + assert saved["qid"] == "Q7" + assert saved["status"] == "in queue" + assert saved["algo_version"] == "1.2.3" + assert 
isinstance(saved["requested_timestamp"], datetime) + + +def test_request_item_processing_returns_error_message_on_exception(): + queue = DummyQueue(should_raise=True) + + msg = requestItemProcessing( + qid="Q99", + queue=queue, + save_function=lambda _doc: None, + ) + + assert msg.startswith("An error occurred:") + + +def test_get_next_request_returns_document_or_none(): + handler = MongoDBHandler.__new__(MongoDBHandler) + + queue_with_doc = DummyQueue(find_one_and_update_value={"qid": "Q1"}) + queue_without_doc = DummyQueue(find_one_and_update_value=None) + + assert handler.get_next_request(queue_with_doc) == {"qid": "Q1"} + assert handler.get_next_request(queue_without_doc) is None + + +def test_get_next_request_wraps_errors_in_runtime_error(): + handler = MongoDBHandler.__new__(MongoDBHandler) + bad_queue = DummyQueue(should_raise=True) + + try: + handler.get_next_request(bad_queue) + assert False, "Expected RuntimeError" + except RuntimeError as exc: + assert "Failed to get next request" in str(exc) + + +def test_get_request_by_id_converts_string_to_objectid(): + handler = MongoDBHandler.__new__(MongoDBHandler) + + class QueueSpy: + def __init__(self): + self.query = None + + def find_one(self, query): + self.query = query + return {"ok": True} + + queue = QueueSpy() + _id = str(ObjectId()) + + result = handler.get_request_by_id(queue, _id) + + assert result == {"ok": True} + assert isinstance(queue.query["_id"], ObjectId) diff --git a/prove-shared/tests/test_objects.py b/prove-shared/tests/test_objects.py new file mode 100644 index 0000000..7f36519 --- /dev/null +++ b/prove-shared/tests/test_objects.py @@ -0,0 +1,98 @@ +from datetime import datetime, timedelta + +from bson import ObjectId + +from prove_shared.objects import Entailment, HtmlContent, Status + + +def test_status_comparisons_and_formatted_timestamp_returns_expected_values(): + now = datetime.utcnow() + later = now + timedelta(minutes=1) + + s1 = Status( + _id=ObjectId(), + qid="Q1", + 
task_id="t1", + status="in queue", + algo_version="1.0", + request_type="userRequested", + requested_timestamp=now, + processing_start_timestamp=now, + completed_timestamp=now, + last_updated=now, + ) + s2 = Status( + _id=ObjectId(), + qid="Q1", + task_id="t2", + status="completed", + algo_version="1.0", + request_type="userRequested", + requested_timestamp=later, + processing_start_timestamp=later, + completed_timestamp=later, + last_updated=later, + ) + + assert s1 < s2 + assert s2 > s1 + assert s1 <= now + assert s2 >= later + assert s1 == now + assert s1.get_formated_requested_timestamp().endswith("Z") + + +def test_htmlcontent_get_item_returns_error_payload_for_non_200_status(): + content = HtmlContent( + reference_id="ref-1", + task_id="task-1", + entity_label="Entity", + object_label="Object", + property_label="prop", + status=404, + url="https://example.org", + lang="en", + object_id="Q123", + property_id="P31", + ) + + item = content.get_item() + + assert item["qid"] == "Q123" + assert item["property_id"] == "P31" + assert item["result"] == "error" + assert "HTTP Error code: 404" in item["result_sentence"] + + +def test_htmlcontent_add_info_item_updates_result_fields(): + content = HtmlContent( + reference_id="ref-2", + task_id="task-2", + entity_label="Entity", + object_label="Object", + property_label="prop", + status=200, + url="https://example.org", + lang="en", + object_id="Q456", + property_id="P279", + ) + + entailment = Entailment( + _id=ObjectId(), + text_entailment_score=0.9, + similarity_score=0.8, + processed_timestamp=datetime.utcnow(), + result="SUPPORTS", + result_sentence="Evidence sentence", + reference_id="ref-2", + label_probabilities={"SUPPORTS": 0.9, "REFUTES": 0.05, "NOT ENOUGH INFO": 0.05}, + task_id="task-2", + save_timestamp=datetime.utcnow(), + ) + + content.add_info_item(entailment) + item = content.get_item() + + assert item["result"] == "SUPPORTS" + assert item["result_sentence"].endswith("/ Evidence sentence") diff --git 
a/pyproject.toml b/pyproject.toml index 9da5d23..2e8ebfd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ readme = "README.md" requires-python = "==3.10.16" dependencies = [ + "prove_shared @ file:./prove-shared", "pandas", "pyyaml", "plotly",