Split database work into a separate thread

This commit is contained in:
Christopher T. Johnson
2025-02-08 16:42:05 -05:00
parent 58f7e1b59c
commit ffc840dc66
2 changed files with 260 additions and 171 deletions

View File

@@ -2,20 +2,40 @@
import datetime
import re
import sys
import dateparser
import requests
from typing import NoReturn
from PySide6.QtCore import QCoreApplication, QModelIndex, Signal, Qt, Slot
from PySide6.QtSql import QSqlDatabase, QSqlQuery, QSqlQueryModel
from PySide6.QtWidgets import QAbstractItemView, QApplication, QHeaderView, QMainWindow, QStyledItemDelegate, QTableWidgetItem
from bs4 import BeautifulSoup, Tag
from PySide6.QtCore import (
QCoreApplication,
QModelIndex,
QPersistentModelIndex,
QRect,
QSize,
Signal,
Slot,
)
from PySide6.QtGui import QPainter, QTextDocument
from PySide6.QtSql import (
QSqlDatabase,
QSqlQuery,
QSqlTableModel,
)
from PySide6.QtWidgets import (
QAbstractItemView,
QApplication,
QHeaderView,
QMainWindow,
QStyle,
QStyledItemDelegate,
QStyleOptionViewItem,
)
from docketModel import docketModel
from ui.MainWindow import Ui_MainWindow
from lib.utils import query_error
from ui.MainWindow import Ui_MainWindow
from workers import updateThread
translate = QCoreApplication.translate
class dateDelegate(QStyledItemDelegate):
def displayText(self, value, locale) -> str:
date = datetime.date.fromtimestamp(value)
@@ -183,168 +203,7 @@ def schema_update(db: QSqlDatabase) -> None:
db.commit()
return
def update_proceedings(case_id: int, bs: BeautifulSoup) -> None:
    """Store the proceedings entries of a docket page and their documents.

    The table with id "proceedings" is read from ``bs``. Its first row is
    discarded; the remaining rows are consumed two at a time: an entry row
    (date cell, text cell) followed by a row whose second cell contains the
    document links. Entries and documents that are already stored are left
    untouched, missing ones are inserted.

    Args:
        case_id: key of the owning case, written to ``entries.case_id``.
        bs: parsed docket page.
    """
    proceedings = bs.find("table", id="proceedings")
    assert isinstance(proceedings, Tag)
    rows = proceedings.find_all("tr")
    rows.pop(0)  # first row is dropped without being read
    sql = QSqlQuery()
    for pos in range(0, len(rows), 2):
        #
        # Entry row: first cell is the date, second cell is the text.
        #
        entry_row = rows[pos]
        assert isinstance(entry_row, Tag)
        date_cell = entry_row.contents[0]
        assert isinstance(date_cell, Tag) and isinstance(date_cell.string, str)
        when = dateparser.parse(date_cell.string)
        text_cell = entry_row.contents[1]
        assert isinstance(text_cell, Tag) and isinstance(text_cell.string, str)
        summary = text_cell.string.strip()

        sql.prepare(
            "SELECT * FROM entries WHERE case_id = :cid AND date = :date AND text=:text"
        )
        sql.bindValue(":cid", case_id)
        sql.bindValue(":text", summary)
        assert isinstance(when, datetime.date)
        stamp = when.timestamp()
        sql.bindValue(":date", stamp)
        if not sql.exec():
            query_error(sql)
        if sql.next():
            # Already stored: the first column of the row is used as the id.
            entry_id = sql.value(0)
        else:
            sql.prepare(
                "INSERT INTO entries (case_id, date, text) VALUES (:cid,:date,:text)"
            )
            sql.bindValue(":cid", case_id)
            sql.bindValue(":date", stamp)
            sql.bindValue(":text", summary)
            if not sql.exec():
                query_error(sql)
            entry_id = sql.lastInsertId()

        #
        # The row after the entry carries its document links in cell 1.
        #
        links_row = rows[pos + 1]
        assert isinstance(links_row, Tag)
        links_cell = links_row.contents[1]
        assert isinstance(links_cell, Tag)
        for anchor in links_cell:
            assert isinstance(anchor, Tag)
            href = anchor.attrs["href"]
            label = anchor.string
            sql.prepare("SELECT * FROM documents WHERE url=:url AND entry_id = :eid")
            sql.bindValue(":url", href)
            sql.bindValue(":eid", entry_id)
            if not sql.exec():
                query_error(sql)
            if sql.next():
                continue  # document already recorded for this entry
            sql.prepare(
                "INSERT INTO documents (entry_id, name, url) "
                "VALUES (:eid, :name, :url)"
            )
            sql.bindValue(":eid", entry_id)
            sql.bindValue(":name", label)
            sql.bindValue(":url", href)
            if not sql.exec():
                query_error(sql)
    return
def update_db(case_id) -> int:
    """Fetch a docket page and store the case it describes.

    Downloads the public docket page for ``case_id``, extracts the docket
    number, parties, docketed date and linked case from the "docketinfo"
    table, inserts the case if it is not stored yet, resolves the linked case
    (fetching it recursively when it is unknown) and finally stores the
    proceedings via ``update_proceedings``.

    Args:
        case_id: docket number used to build the page URL (e.g. '24-203').

    Returns:
        The id of the case row (first column of ``cases``).

    Raises:
        SystemExit: with code 1 when the server does not answer with 200.
        requests.exceptions.Timeout: when the server does not respond in time.
    """
    # A timeout keeps an unresponsive server from blocking the caller forever.
    r = requests.get(
        f"https://www.supremecourt.gov/docket/docketfiles/html/public/{case_id}.html",
        timeout=30,
    )
    if r.status_code != 200:
        print(r.status_code)
        # Raise directly instead of calling exit(), which is only injected by
        # the site module and is meant for interactive use.
        raise SystemExit(1)
    bs = BeautifulSoup(r.text, "lxml")
    #
    # docket_id, previous_docket, petitioners, respondents, date
    # all come from the docketinfo table
    #
    di = bs.find("table", id="docketinfo")
    assert di is not None and isinstance(di, Tag)
    #
    # docket_id is first row, first column
    id_span = di.find("span")
    assert id_span is not None and isinstance(id_span, Tag)
    raw_id = id_span.contents[0]
    assert isinstance(raw_id, str)
    docket_id = raw_id.strip().replace("No. ", "")
    #
    # Title is second row, first column
    tr = di.contents[1]
    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
    assert tr.contents[0].string == "Title:"
    td = tr.contents[1]
    assert isinstance(td, Tag)
    span = td.contents[0]
    assert isinstance(span, Tag) and isinstance(span.contents[0], str)
    petitioners = span.contents[0].strip()
    #
    # XXX - We need to deal with other titles.  Change this to an RE
    # UPDATED: we are just handling the two we know about.
    #
    petitioners = petitioners.replace(", Petitioners", "")
    petitioners = petitioners.replace(", Applicants", "")
    assert isinstance(span.contents[4], str)
    respondent = span.contents[4].strip()
    #
    # Date on which the case was docketed
    tr = di.contents[2]
    assert isinstance(tr, Tag) and isinstance(tr.contents[1], Tag)
    td = tr.contents[1]
    assert isinstance(td, Tag) and td.string is not None
    docket_date = td.string.strip()
    date = dateparser.parse(docket_date)
    #
    # linked case is row 3, column 0
    tr = di.contents[3]
    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
    linked = tr.contents[0].string
    #
    # See if this case already exists.
    #
    query = QSqlQuery()
    query.prepare("SELECT * FROM cases WHERE docket_id = :did")
    query.bindValue(":did", docket_id)
    if not query.exec():
        query_error(query)
    #
    # if it does not exist, create it.  This stops a recursion loop:
    # a linked case that points back here will find this row.
    #
    if not query.next():
        query.prepare(
            "INSERT INTO cases (docket_id, petitioners, respondents, date, linked) "
            "VALUES (:did, :pet, :resp, :date, NULL)"
        )
        query.bindValue(":did", docket_id)
        query.bindValue(":pet", petitioners)
        query.bindValue(":resp", respondent)
        assert isinstance(date, datetime.date)
        query.bindValue(":date", date.timestamp())
        if not query.exec():
            query_error(query)
        case_id = query.lastInsertId()
        linked_id = None
    else:
        case_id = query.value(0)
        linked_id = query.value("linked")
    assert isinstance(case_id, int)
    #
    # If there is a linked case, we need to get the ID for that case.
    if linked is not None:
        linked = linked.replace("Linked with ", "")
        query.prepare("SELECT * FROM cases WHERE docket_id = :did")
        query.bindValue(":did", linked)
        if not query.exec():
            query_error(query)
        if not query.next():
            new_id = update_db(linked)
        else:
            new_id = query.value(0)
        # Only touch the row when the stored link differs.
        if new_id != linked_id:
            query.prepare("UPDATE cases SET linked=:lid WHERE case_id = :cid")
            query.bindValue(":lid", new_id)
            query.bindValue(":cid", case_id)
            if not query.exec():
                query_error(query)
    #
    # XXX - Process lower courts
    #
    update_proceedings(case_id, bs)
    return case_id
def main() -> int:
app = QApplication(sys.argv)
db = QSqlDatabase.addDatabase("QSQLITE")
@@ -352,8 +211,6 @@ def main() -> int:
db.setDatabaseName("scotus.db")
db.open()
schema_update(db)
update_db('24-203')
update_db('23A1058')
window = MainWindow()
return app.exec()

232
workers.py Normal file
View File

@@ -0,0 +1,232 @@
import datetime
from re import template
import dateparser
import requests
from bs4 import BeautifulSoup, Tag
from PySide6.QtCore import QThread
from PySide6.QtSql import QSqlDatabase, QSqlQuery
from lib.utils import query_error
def update_proceedings(case_id: int, bs: BeautifulSoup) -> bool:
    """Store the proceedings entries of a docket page and their documents.

    The table with id "proceedings" is read from ``bs``. Its first row is
    discarded; the remaining rows are consumed two at a time: an entry row
    (date cell, text cell) followed by a row whose second cell contains the
    document links. Entries and documents that are already stored are left
    untouched, missing ones are inserted. All statements run on the database
    connection named "update".

    Args:
        case_id: key of the owning case, written to ``entries.case_id``.
        bs: parsed docket page.

    Returns:
        False when the text of the last entry is "petition denied."
        (compared case-insensitively), True otherwise -- including when the
        table holds no entries at all.
    """
    table = bs.find("table", id="proceedings")
    assert isinstance(table, Tag)
    trs = table.find_all("tr")
    trs.pop(0)  # first row is dropped without being read
    query = QSqlQuery(QSqlDatabase.database("update"))
    # Pre-set so the check after the loop is defined even when the table has
    # no entry rows (previously an UnboundLocalError).
    text = ""
    while trs:
        #
        # Entry row: first cell is the date, second cell is the text.
        #
        tr = trs.pop(0)
        assert isinstance(tr, Tag)
        td = tr.contents[0]
        assert isinstance(td, Tag) and isinstance(td.string, str)
        date = dateparser.parse(td.string)
        td = tr.contents[1]
        assert isinstance(td, Tag) and isinstance(td.string, str)
        text = td.string.strip()
        query.prepare(
            "SELECT * FROM entries WHERE case_id = :cid AND date = :date AND text=:text"
        )
        query.bindValue(":cid", case_id)
        query.bindValue(":text", text)
        assert isinstance(date, datetime.date)
        query.bindValue(":date", date.timestamp())
        if not query.exec():
            query_error(query)
        if not query.next():
            query.prepare(
                "INSERT INTO entries (case_id, date, text) VALUES (:cid,:date,:text)"
            )
            query.bindValue(":cid", case_id)
            query.bindValue(":date", date.timestamp())
            query.bindValue(":text", text)
            if not query.exec():
                query_error(query)
            entry_id = query.lastInsertId()
        else:
            # Already stored: the first column of the row is used as the id.
            entry_id = query.value(0)
        #
        # The row after the entry carries its document links in cell 1.
        #
        tr = trs.pop(0)
        assert isinstance(tr, Tag)
        assert isinstance(tr.contents[1], Tag)
        for a in tr.contents[1]:
            assert isinstance(a, Tag)
            url = a.attrs["href"]
            name = a.string
            query.prepare(
                "SELECT * FROM documents WHERE url=:url AND entry_id = :eid"
            )
            query.bindValue(":url", url)
            query.bindValue(":eid", entry_id)
            if not query.exec():
                query_error(query)
            if not query.next():
                query.prepare(
                    "INSERT INTO documents (entry_id, name, url) "
                    "VALUES (:eid, :name, :url)"
                )
                query.bindValue(":eid", entry_id)
                query.bindValue(":name", name)
                query.bindValue(":url", url)
                if not query.exec():
                    query_error(query)
    print(f"text: {text.lower()}")
    # The case stays active unless its last entry is a known final one.
    result = text.lower() not in [
        "petition denied.",
    ]
    return result
def update_db(case_id) -> int:
    """Refresh one case from its public docket page.

    Looks the docket up in ``cases`` (on the database connection named
    "update"). A stored case whose ``active`` column is not 1 is returned
    without any network access. Otherwise the docket page is downloaded, the
    case is inserted if it is new, the linked case is resolved (fetched
    recursively when it is unknown), the proceedings are stored via
    ``update_proceedings`` and the case is marked inactive when that call
    reports it is no longer active.

    Args:
        case_id: docket number used for the lookup and to build the page URL.

    Returns:
        The id of the case row.

    Raises:
        SystemExit: with code 1 when the server does not answer with 200.
        requests.exceptions.Timeout: when the server does not respond in time.
    """
    #
    # See if this case already exists.
    #
    # We assume that case_id == docket_id at this point.  If it does not,
    # then we will build out from the request we get
    query = QSqlQuery(QSqlDatabase.database("update"))
    query.prepare("SELECT * FROM cases WHERE docket_id = :did")
    query.bindValue(":did", case_id)
    if not query.exec():
        query_error(query)
    exists = query.next()
    # Unknown cases are treated as active so that they get fetched.
    active = query.value("active") == 1 if exists else True
    if not active:
        return query.value("case_id")
    # A timeout keeps an unresponsive server from blocking this call forever.
    r = requests.get(
        f"https://www.supremecourt.gov/docket/docketfiles/html/public/{case_id}.html",
        timeout=30,
    )
    if r.status_code != 200:
        print(r.status_code)
        # Raise directly instead of calling exit(), which is only injected by
        # the site module and is meant for interactive use.
        raise SystemExit(1)
    bs = BeautifulSoup(r.text, "lxml")
    #
    # docket_id, previous_docket, petitioners, respondents, date
    # all come from the docketinfo table
    #
    di = bs.find("table", id="docketinfo")
    assert di is not None and isinstance(di, Tag)
    #
    # docket_id is first row, first column
    span = di.find("span")
    assert span is not None and isinstance(span, Tag)
    tmp = span.contents[0]
    assert isinstance(tmp, str)
    docket_id = tmp.strip().replace("No. ", "")
    #
    # Title is second row, first column
    tr = di.contents[1]
    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
    assert tr.contents[0].string == "Title:"
    td = tr.contents[1]
    assert isinstance(td, Tag)
    span = td.contents[0]
    assert isinstance(span, Tag) and isinstance(span.contents[0], str)
    petitioners = span.contents[0].strip()
    #
    # XXX - We need to deal with other titles.  Change this to an RE
    # UPDATED: we are just handling the two we know about.
    #
    petitioners = petitioners.replace(", Petitioners", "")
    petitioners = petitioners.replace(", Applicants", "")
    assert isinstance(span.contents[4], str)
    respondent = span.contents[4].strip()
    #
    # Date on which the case was docketed
    tr = di.contents[2]
    assert isinstance(tr, Tag) and isinstance(tr.contents[1], Tag)
    td = tr.contents[1]
    assert isinstance(td, Tag) and td.string is not None
    docket_date = td.string.strip()
    date = dateparser.parse(docket_date)
    #
    # linked case is row 3, column 0
    tr = di.contents[3]
    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
    linked = tr.contents[0].string
    #
    # if it does not exist, create it.  This stops a recursion loop:
    # a linked case that points back here will find this row.
    #
    if not exists:
        query.prepare(
            "INSERT INTO cases (docket_id, petitioners, respondents, date, active, linked) "
            "VALUES (:did, :pet, :resp, :date, 1, NULL)"
        )
        query.bindValue(":did", docket_id)
        query.bindValue(":pet", petitioners)
        query.bindValue(":resp", respondent)
        assert isinstance(date, datetime.date)
        query.bindValue(":date", date.timestamp())
        if not query.exec():
            query_error(query)
        case_id = query.lastInsertId()
        linked_id = None
    else:
        # The query is still positioned on the row found by the first SELECT.
        case_id = query.value(0)
        linked_id = query.value("linked")
    assert isinstance(case_id, int)
    #
    # If there is a linked case, we need to get the ID for that case.
    if linked is not None:
        linked = linked.replace("Linked with ", "")
        query.prepare("SELECT * FROM cases WHERE docket_id = :did")
        query.bindValue(":did", linked)
        if not query.exec():
            query_error(query)
        if not query.next():
            new_id = update_db(linked)
        else:
            new_id = query.value(0)
        # Only touch the row when the stored link differs.
        if new_id != linked_id:
            query.prepare("UPDATE cases SET linked=:lid WHERE case_id = :cid")
            query.bindValue(":lid", new_id)
            query.bindValue(":cid", case_id)
            if not query.exec():
                query_error(query)
    #
    # XXX - Process lower courts
    #
    active = update_proceedings(case_id, bs)
    if not active:
        query.prepare("UPDATE cases SET active=0 WHERE case_id = :cid")
        query.bindValue(":cid", case_id)
        if not query.exec():
            query_error(query)
    return case_id
class updateThread(QThread):
    """Thread that refreshes a single docket through ``update_db``.

    Set the docket with ``setDocketId`` before starting the thread. ``run``
    makes sure a database connection named "update" exists (cloned from the
    default connection) and then calls ``update_db`` for that docket.
    """

    # Docket to refresh; None until setDocketId() has been called.
    docket_id = None

    def __init__(self):
        super().__init__()
        print("updateThread: __init__(docket_id)")

    def setDocketId(self, docket_id):
        """Remember the docket that the next run() will refresh."""
        self.docket_id = docket_id

    def run(self):
        """Open the "update" connection if needed and refresh the docket.

        Raises:
            ValueError: if no docket id has been set.
            Exception: if the cloned database connection cannot be opened.
        """
        print("updateThread: run()")
        if self.docket_id is None:
            # Fail fast instead of requesting the page for "None".
            raise ValueError("updateThread: docket_id has not been set")
        # NOTE(review): Qt documents that a connection may only be used from
        # the thread that created it; confirm that reusing an existing
        # "update" connection on a later run is safe.
        if "update" not in QSqlDatabase.connectionNames():
            db = QSqlDatabase.cloneDatabase(
                "qt_sql_default_connection", "update"
            )
            if not db.open():
                print(db.lastError())
                raise Exception("db.open()")
        case_id = update_db(self.docket_id)
        print(f"updateThread: run() returns {case_id}")
        # NOTE(review): no event loop is started in this method, so this call
        # presumably has no effect -- kept as in the original; confirm.
        self.exit(1)