diff --git a/scotus-pull.py b/scotus-pull.py
index 691b97f..36ecdc5 100755
--- a/scotus-pull.py
+++ b/scotus-pull.py
@@ -2,20 +2,40 @@
 import datetime
 import re
 import sys
-import dateparser
-import requests
-from typing import NoReturn
 
-from PySide6.QtCore import QCoreApplication, QModelIndex, Signal, Qt, Slot
-from PySide6.QtSql import QSqlDatabase, QSqlQuery, QSqlQueryModel
-from PySide6.QtWidgets import QAbstractItemView, QApplication, QHeaderView, QMainWindow, QStyledItemDelegate, QTableWidgetItem
-from bs4 import BeautifulSoup, Tag
+from PySide6.QtCore import (
+    QCoreApplication,
+    QModelIndex,
+    QPersistentModelIndex,
+    QRect,
+    QSize,
+    Signal,
+    Slot,
+)
+from PySide6.QtGui import QPainter, QTextDocument
+from PySide6.QtSql import (
+    QSqlDatabase,
+    QSqlQuery,
+    QSqlTableModel,
+)
+from PySide6.QtWidgets import (
+    QAbstractItemView,
+    QApplication,
+    QHeaderView,
+    QMainWindow,
+    QStyle,
+    QStyledItemDelegate,
+    QStyleOptionViewItem,
+)
 
 from docketModel import docketModel
-from ui.MainWindow import Ui_MainWindow
 from lib.utils import query_error
+from ui.MainWindow import Ui_MainWindow
+from workers import updateThread
+
 translate = QCoreApplication.translate
+
 
 class dateDelegate(QStyledItemDelegate):
     def displayText(self, value, locale) -> str:
         date = datetime.date.fromtimestamp(value)
@@ -183,168 +203,7 @@ def schema_update(db: QSqlDatabase) -> None:
     db.commit()
     return
 
-def update_proceedings(case_id: int, bs: BeautifulSoup) -> None:
-    table = bs.find('table', id="proceedings")
-    assert isinstance(table, Tag)
-    trs = table.find_all('tr')
-    tr = trs.pop(0)
-    query = QSqlQuery()
-    while len(trs) > 0:
-        tr = trs.pop(0)
-        assert isinstance(tr, Tag)
-        td = tr.contents[0]
-        assert isinstance(td, Tag) and isinstance(td.string, str)
-        date = dateparser.parse(td.string)
-        td = tr.contents[1]
-        assert isinstance(td, Tag) and isinstance(td.string, str)
-        text = td.string.strip()
-        query.prepare("SELECT * FROM entries WHERE case_id = :cid AND date = :date AND text=:text")
-        query.bindValue(':cid', case_id)
-        query.bindValue(':text', text)
-        assert isinstance(date, datetime.date)
-        query.bindValue(':date', date.timestamp())
-        if not query.exec():
-            query_error(query)
-        if not query.next():
-            query.prepare("INSERT INTO entries (case_id, date, text) VALUES (:cid,:date,:text)")
-            query.bindValue(':cid', case_id)
-            query.bindValue(':date', date.timestamp())
-            query.bindValue(':text', text)
-            if not query.exec():
-                query_error(query)
-            entry_id = query.lastInsertId()
-        else:
-            entry_id = query.value(0)
-        tr = trs.pop(0)
-        assert isinstance(tr, Tag)
-        assert isinstance(tr.contents[1], Tag)
-        for a in tr.contents[1]:
-            assert isinstance(a, Tag)
-            url = a.attrs['href']
-            name = a.string
-            query.prepare("SELECT * FROM documents WHERE url=:url AND entry_id = :eid")
-            query.bindValue(':url', url)
-            query.bindValue(":eid", entry_id)
-            if not query.exec():
-                query_error(query)
-            if not query.next():
-                query.prepare("INSERT INTO documents (entry_id, name, url) "
-                              "VALUES (:eid, :name, :url)")
-                query.bindValue(":eid", entry_id)
-                query.bindValue(":name", name)
-                query.bindValue(":url", url)
-                if not query.exec():
-                    query_error(query)
-    return
 
-def update_db(case_id) -> int:
-    r = requests.get('https://www.supremecourt.gov/docket/docketfiles/html/public/{}.html'.format(case_id))
-    if r.status_code != 200:
-        print(r.status_code)
-        exit(1)
-    bs = BeautifulSoup(r.text,'lxml')
-    #
-    # docket_id, previous_docket, petitioners, respondents, date
-    # all come from the docketinfo table
-    #
-    di = bs.find('table',id='docketinfo')
-    assert di is not None and isinstance(di, Tag)
-
-    #
-    # docket_id is first row, first column
-    docket_id = di.find('span')
-    assert docket_id is not None and isinstance(docket_id, Tag)
-    docket_id = docket_id.contents[0]
-    assert isinstance(docket_id, str)
-    docket_id = docket_id.strip()
-    docket_id = docket_id.replace('No. ','')
-
-    #
-    # Title is second row, first column
-    tr = di.contents[1]
-    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
-    assert tr.contents[0].string == 'Title:'
-    td = tr.contents[1]
-    assert isinstance(td, Tag)
-    span = td.contents[0]
-    assert isinstance(span, Tag) and isinstance(span.contents[0], str)
-    petitioners = span.contents[0].strip()
-    #
-    # XXX - We need to deal with other titles. Change this to an RE
-    # UPDATED: we are just handling the two we know about.
-    #
-    petitioners = petitioners.replace(', Petitioners','')
-    petitioners = petitioners.replace(', Applicants','')
-    assert isinstance(span.contents[4], str)
-    respondent = span.contents[4].strip()
-
-    #
-    # Date on which the case was docketed
-    tr = di.contents[2]
-    assert isinstance(tr,Tag) and isinstance(tr.contents[1], Tag)
-    td = tr.contents[1]
-    assert isinstance(td, Tag) and td.string is not None
-    docket_date = td.string.strip()
-    date = dateparser.parse(docket_date)
-
-    #
-    # linked case is row 3, column 0
-    tr = di.contents[3]
-    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
-    linked = tr.contents[0].string
-
-    #
-    # See if this case already exists.
-    #
-    query = QSqlQuery()
-    query.prepare("SELECT * FROM cases WHERE docket_id = :did")
-    query.bindValue(':did', docket_id)
-    if not query.exec():
-        query_error(query)
-
-    #
-    # if it does not exists, create it. This stops a recursion loop.
-    #
-    if not query.next():
-        query.prepare("INSERT INTO cases (docket_id, petitioners, respondents, date, linked) "
-                      "VALUES (:did, :pet, :resp, :date, NULL)")
-        query.bindValue(':did', docket_id)
-        query.bindValue(':pet', petitioners)
-        query.bindValue(':resp', respondent)
-        assert isinstance(date, datetime.date)
-        query.bindValue(':date', date.timestamp())
-        if not query.exec():
-            query_error(query)
-        case_id = query.lastInsertId()
-        linked_id = None
-    else:
-        case_id = query.value(0)
-        linked_id = query.value('linked')
-    assert isinstance(case_id, int)
-    #
-    # If there is a linked case, we need to get the ID for that case.
-    if linked is not None:
-        linked = linked.replace('Linked with ','')
-        query.prepare("SELECT * FROM cases WHERE docket_id = :did")
-        query.bindValue(':did', linked)
-        if not query.exec():
-            query_error(query)
-        if not query.next():
-            new_id = update_db(linked)
-        else:
-            new_id = query.value(0)
-        if new_id != linked_id:
-            query.prepare("UPDATE cases SET linked=:lid WHERE case_id = :cid")
-            query.bindValue(':lid', new_id)
-            query.bindValue(':cid', case_id)
-            if not query.exec():
-                query_error(query)
-    #
-    # XXX - Process lower courts
-    #
-    update_proceedings(case_id, bs)
-    return(case_id)
-
 def main() -> int:
     app = QApplication(sys.argv)
     db = QSqlDatabase.addDatabase("QSQLITE")
@@ -352,8 +211,6 @@ def main() -> int:
     db.setDatabaseName("scotus.db")
     db.open()
     schema_update(db)
-    update_db('24-203')
-    update_db('23A1058')
     window = MainWindow()
     return app.exec()
 
diff --git a/workers.py b/workers.py
new file mode 100644
index 0000000..b1f9307
--- /dev/null
+++ b/workers.py
@@ -0,0 +1,284 @@
+import datetime
+
+import dateparser
+import requests
+from bs4 import BeautifulSoup, Tag
+from PySide6.QtCore import QThread
+from PySide6.QtSql import QSqlDatabase, QSqlQuery
+
+from lib.utils import query_error
+
+#
+# Docket pages are fetched from this URL; {} is replaced by the docket number.
+DOCKET_URL = "https://www.supremecourt.gov/docket/docketfiles/html/public/{}.html"
+
+#
+# Seconds to wait for the web server.  Without a timeout requests.get() can
+# block the update thread forever.
+REQUEST_TIMEOUT = 30
+
+#
+# Entry texts (lower case) that mark a case as no longer active.
+CLOSING_ENTRIES = frozenset({"petition denied."})
+
+
+def update_proceedings(case_id: int, bs: BeautifulSoup) -> bool:
+    """Store the docket entries and documents of one case.
+
+    The "proceedings" table is read as a header row followed by pairs of
+    rows: an entry row (date, text) and a row whose second cell holds the
+    links to the documents of that entry.  Entries and documents that are
+    already in the database are not inserted again.
+
+    Returns False if the text of the last entry is in CLOSING_ENTRIES, True
+    otherwise (also when the table has no entries).
+    """
+    table = bs.find("table", id="proceedings")
+    assert isinstance(table, Tag)
+    #
+    # Skip the header row.  Unlike pop(0), the slice also works for a table
+    # without any rows.
+    trs = table.find_all("tr")[1:]
+    query = QSqlQuery(QSqlDatabase.database("update"))
+    text = ""
+    for i in range(0, len(trs), 2):
+        tr = trs[i]
+        assert isinstance(tr, Tag)
+        td = tr.contents[0]
+        assert isinstance(td, Tag) and isinstance(td.string, str)
+        date = dateparser.parse(td.string)
+        #
+        # timestamp() is a method of datetime.datetime, not of datetime.date
+        assert isinstance(date, datetime.datetime)
+        stamp = date.timestamp()
+        td = tr.contents[1]
+        assert isinstance(td, Tag) and isinstance(td.string, str)
+        text = td.string.strip()
+        query.prepare(
+            "SELECT * FROM entries WHERE case_id = :cid AND date = :date AND text=:text"
+        )
+        query.bindValue(":cid", case_id)
+        query.bindValue(":text", text)
+        query.bindValue(":date", stamp)
+        if not query.exec():
+            query_error(query)
+        if not query.next():
+            query.prepare(
+                "INSERT INTO entries (case_id, date, text) VALUES (:cid,:date,:text)"
+            )
+            query.bindValue(":cid", case_id)
+            query.bindValue(":date", stamp)
+            query.bindValue(":text", text)
+            if not query.exec():
+                query_error(query)
+            entry_id = query.lastInsertId()
+        else:
+            entry_id = query.value(0)
+        #
+        # The documents row follows its entry.  A last entry without such a
+        # row has no documents to store.
+        if i + 1 >= len(trs):
+            break
+        tr = trs[i + 1]
+        assert isinstance(tr, Tag)
+        assert isinstance(tr.contents[1], Tag)
+        #
+        # Only visit the links, so that text between them is ignored.
+        for a in tr.contents[1].find_all("a", href=True):
+            url = a.attrs["href"]
+            name = a.string
+            query.prepare(
+                "SELECT * FROM documents WHERE url=:url AND entry_id = :eid"
+            )
+            query.bindValue(":url", url)
+            query.bindValue(":eid", entry_id)
+            if not query.exec():
+                query_error(query)
+            if not query.next():
+                query.prepare(
+                    "INSERT INTO documents (entry_id, name, url) "
+                    "VALUES (:eid, :name, :url)"
+                )
+                query.bindValue(":eid", entry_id)
+                query.bindValue(":name", name)
+                query.bindValue(":url", url)
+                if not query.exec():
+                    query_error(query)
+    print(f"text: {text.lower()}")
+    return text.lower() not in CLOSING_ENTRIES
+
+
+def update_db(case_id) -> int:
+    """Download one docket page and bring the database up to date with it.
+
+    case_id is the docket number used in the page URL and is assumed to be
+    equal to cases.docket_id.  A docket that is not in the database yet is
+    inserted, one that is marked inactive is returned without a download.
+    A linked docket that is not in the database yet is updated recursively.
+
+    Returns cases.case_id of the docket.  Raises SystemExit if the server
+    does not answer with status 200.
+    """
+    #
+    # See if this case already exists.
+    #
+    # We assume that case_id == docket_id at this point.  If it does not,
+    # then we will build out from the request we get
+
+    query = QSqlQuery(QSqlDatabase.database("update"))
+    query.prepare("SELECT * FROM cases WHERE docket_id = :did")
+    query.bindValue(":did", case_id)
+    if not query.exec():
+        query_error(query)
+    exists = query.next()
+    if exists and query.value("active") != 1:
+        return query.value("case_id")
+
+    r = requests.get(DOCKET_URL.format(case_id), timeout=REQUEST_TIMEOUT)
+    if r.status_code != 200:
+        print(r.status_code)
+        #
+        # exit() is added by the site module and not meant for programs;
+        # raise the exception it would raise directly.
+        raise SystemExit(1)
+    bs = BeautifulSoup(r.text, "lxml")
+    #
+    # docket_id, previous_docket, petitioners, respondents, date
+    # all come from the docketinfo table
+    #
+    di = bs.find("table", id="docketinfo")
+    assert di is not None and isinstance(di, Tag)
+
+    #
+    # docket_id is first row, first column
+    span = di.find("span")
+    assert span is not None and isinstance(span, Tag)
+    tmp = span.contents[0]
+    assert isinstance(tmp, str)
+    docket_id = tmp.strip()
+    docket_id = docket_id.replace("No. ", "")
+
+    #
+    # Title is second row, first column
+    tr = di.contents[1]
+    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
+    assert tr.contents[0].string == "Title:"
+    td = tr.contents[1]
+    assert isinstance(td, Tag)
+    span = td.contents[0]
+    assert isinstance(span, Tag) and isinstance(span.contents[0], str)
+    petitioners = span.contents[0].strip()
+    #
+    # XXX - We need to deal with other titles. Change this to an RE
+    # UPDATED: we are just handling the two we know about.
+    #
+    petitioners = petitioners.replace(", Petitioners", "")
+    petitioners = petitioners.replace(", Applicants", "")
+    assert isinstance(span.contents[4], str)
+    respondent = span.contents[4].strip()
+
+    #
+    # Date on which the case was docketed
+    tr = di.contents[2]
+    assert isinstance(tr, Tag) and isinstance(tr.contents[1], Tag)
+    td = tr.contents[1]
+    assert isinstance(td, Tag) and td.string is not None
+    docket_date = td.string.strip()
+    date = dateparser.parse(docket_date)
+
+    #
+    # linked case is row 3, column 0
+    tr = di.contents[3]
+    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
+    linked = tr.contents[0].string
+
+    #
+    # if it does not exist, create it. This stops a recursion loop.
+    #
+    if not exists:
+        query.prepare(
+            "INSERT INTO cases (docket_id, petitioners, respondents, date, active, linked) "
+            "VALUES (:did, :pet, :resp, :date, 1, NULL)"
+        )
+        query.bindValue(":did", docket_id)
+        query.bindValue(":pet", petitioners)
+        query.bindValue(":resp", respondent)
+        #
+        # timestamp() is a method of datetime.datetime, not of datetime.date
+        assert isinstance(date, datetime.datetime)
+        query.bindValue(":date", date.timestamp())
+        if not query.exec():
+            query_error(query)
+        case_id = query.lastInsertId()
+        linked_id = None
+    else:
+        case_id = query.value(0)
+        linked_id = query.value("linked")
+    assert isinstance(case_id, int)
+    #
+    # If there is a linked case, we need to get the ID for that case.
+    if linked is not None:
+        linked = linked.replace("Linked with ", "")
+        query.prepare("SELECT * FROM cases WHERE docket_id = :did")
+        query.bindValue(":did", linked)
+        if not query.exec():
+            query_error(query)
+        if not query.next():
+            new_id = update_db(linked)
+        else:
+            new_id = query.value(0)
+        if new_id != linked_id:
+            query.prepare("UPDATE cases SET linked=:lid WHERE case_id = :cid")
+            query.bindValue(":lid", new_id)
+            query.bindValue(":cid", case_id)
+            if not query.exec():
+                query_error(query)
+    #
+    # XXX - Process lower courts
+    #
+    active = update_proceedings(case_id, bs)
+    if not active:
+        query.prepare("UPDATE cases SET active=0 WHERE case_id = :cid")
+        query.bindValue(":cid", case_id)
+        if not query.exec():
+            query_error(query)
+    return case_id
+
+
+class updateThread(QThread):
+    """Thread that updates one docket with update_db().
+
+    Set the docket with setDocketId() before the thread is started.
+    """
+
+    docket_id = None
+
+    def __init__(self):
+        super().__init__()
+        print("updateThread: __init__(docket_id)")
+
+    def setDocketId(self, docket_id):
+        """Set the docket number that run() will update."""
+        self.docket_id = docket_id
+
+    def run(self):
+        """Open this thread's database connection and update the docket."""
+        print("updateThread: run()")
+        if self.docket_id is None:
+            raise ValueError("updateThread: setDocketId() was not called")
+        #
+        # The "update" connection is cloned from the default one on first
+        # use; update_db() and update_proceedings() look it up by name.
+        # NOTE(review): Qt ties a connection to the thread that created it; a
+        # later run() in a new thread would find and reuse this one - confirm.
+        if "update" not in QSqlDatabase.connectionNames():
+            db = QSqlDatabase.cloneDatabase(
+                "qt_sql_default_connection", "update"
+            )
+            if not db.open():
+                print(db.lastError())
+                raise Exception("db.open()")
+
+        case_id = update_db(self.docket_id)
+        print(f"updateThread: run() returns {case_id}")
+        self.exit(1)