Split database work to separate Thread
This commit is contained in:
199
scotus-pull.py
199
scotus-pull.py
@@ -2,20 +2,40 @@
|
||||
import datetime
|
||||
import re
|
||||
import sys
|
||||
import dateparser
|
||||
import requests
|
||||
from typing import NoReturn
|
||||
from PySide6.QtCore import QCoreApplication, QModelIndex, Signal, Qt, Slot
|
||||
from PySide6.QtSql import QSqlDatabase, QSqlQuery, QSqlQueryModel
|
||||
from PySide6.QtWidgets import QAbstractItemView, QApplication, QHeaderView, QMainWindow, QStyledItemDelegate, QTableWidgetItem
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from PySide6.QtCore import (
|
||||
QCoreApplication,
|
||||
QModelIndex,
|
||||
QPersistentModelIndex,
|
||||
QRect,
|
||||
QSize,
|
||||
Signal,
|
||||
Slot,
|
||||
)
|
||||
from PySide6.QtGui import QPainter, QTextDocument
|
||||
from PySide6.QtSql import (
|
||||
QSqlDatabase,
|
||||
QSqlQuery,
|
||||
QSqlTableModel,
|
||||
)
|
||||
from PySide6.QtWidgets import (
|
||||
QAbstractItemView,
|
||||
QApplication,
|
||||
QHeaderView,
|
||||
QMainWindow,
|
||||
QStyle,
|
||||
QStyledItemDelegate,
|
||||
QStyleOptionViewItem,
|
||||
)
|
||||
|
||||
from docketModel import docketModel
|
||||
from ui.MainWindow import Ui_MainWindow
|
||||
from lib.utils import query_error
|
||||
from ui.MainWindow import Ui_MainWindow
|
||||
from workers import updateThread
|
||||
|
||||
translate = QCoreApplication.translate
|
||||
|
||||
|
||||
class dateDelegate(QStyledItemDelegate):
|
||||
def displayText(self, value, locale) -> str:
|
||||
date = datetime.date.fromtimestamp(value)
|
||||
@@ -183,167 +203,6 @@ def schema_update(db: QSqlDatabase) -> None:
|
||||
db.commit()
|
||||
return
|
||||
|
||||
def update_proceedings(case_id: int, bs: BeautifulSoup) -> None:
    """Copy the docket's proceedings table into ``entries`` and ``documents``.

    The first row of the ``proceedings`` table is skipped (presumably the
    header).  The remaining rows are read in pairs: an entry row (date in
    the first cell, text in the second) followed by a row whose second cell
    contains the links to that entry's documents.  Entries and documents
    are inserted only when no matching row is stored yet.

    Args:
        case_id: Value written to ``entries.case_id`` for every entry.
        bs: Parsed docket page.

    Raises:
        AssertionError: If the page does not have the expected layout or an
            entry date cannot be parsed.
    """
    table = bs.find('table', id="proceedings")
    assert isinstance(table, Tag)
    # Slicing instead of repeated pop(0): drops the first row without
    # shifting the whole list once per row.
    rows = table.find_all('tr')[1:]
    query = QSqlQuery()
    for index in range(0, len(rows), 2):
        entry_row = rows[index]
        assert isinstance(entry_row, Tag)
        td = entry_row.contents[0]
        assert isinstance(td, Tag) and isinstance(td.string, str)
        date = dateparser.parse(td.string)
        # dateparser returns a datetime (or None); only datetime has
        # timestamp(), so narrow to that type rather than datetime.date.
        assert isinstance(date, datetime.datetime)
        timestamp = date.timestamp()
        td = entry_row.contents[1]
        assert isinstance(td, Tag) and isinstance(td.string, str)
        text = td.string.strip()

        query.prepare("SELECT * FROM entries WHERE case_id = :cid AND date = :date AND text=:text")
        query.bindValue(':cid', case_id)
        query.bindValue(':date', timestamp)
        query.bindValue(':text', text)
        if not query.exec():
            query_error(query)
        if query.next():
            # First column of the stored row is used as the entry id.
            entry_id = query.value(0)
        else:
            query.prepare("INSERT INTO entries (case_id, date, text) VALUES (:cid,:date,:text)")
            query.bindValue(':cid', case_id)
            query.bindValue(':date', timestamp)
            query.bindValue(':text', text)
            if not query.exec():
                query_error(query)
            entry_id = query.lastInsertId()

        if index + 1 >= len(rows):
            # Last entry has no documents row after it: nothing to link.
            break
        docs_row = rows[index + 1]
        assert isinstance(docs_row, Tag)
        links = docs_row.contents[1]
        assert isinstance(links, Tag)
        for a in links:
            assert isinstance(a, Tag)
            url = a.attrs['href']
            name = a.string
            query.prepare("SELECT * FROM documents WHERE url=:url AND entry_id = :eid")
            query.bindValue(':url', url)
            query.bindValue(":eid", entry_id)
            if not query.exec():
                query_error(query)
            if not query.next():
                query.prepare("INSERT INTO documents (entry_id, name, url) "
                              "VALUES (:eid, :name, :url)")
                query.bindValue(":eid", entry_id)
                query.bindValue(":name", name)
                query.bindValue(":url", url)
                if not query.exec():
                    query_error(query)
    return
|
||||
|
||||
def update_db(case_id) -> int:
    """Download one docket page and store its case, link and proceedings.

    The page is fetched from supremecourt.gov, the ``docketinfo`` table is
    parsed for docket number, parties, docketing date and linked case, and
    a ``cases`` row is created when the docket number is not stored yet.
    A linked case that is not in the database is fetched recursively.

    Args:
        case_id: Docket number placed in the page URL.

    Returns:
        Id of the ``cases`` row: the first column of the existing row, or
        the id generated by the insert.

    Raises:
        SystemExit: If the server does not answer with HTTP 200.
        requests.exceptions.RequestException: On connection errors or when
            the server does not respond within the timeout.
        AssertionError: If the page does not have the expected layout.
    """
    r = requests.get(
        'https://www.supremecourt.gov/docket/docketfiles/html/public/{}.html'.format(case_id),
        # Without a timeout requests waits indefinitely on a stalled server.
        timeout=30,
    )
    if r.status_code != 200:
        print(r.status_code)
        # Same exit status as exit(1), without relying on the site module.
        raise SystemExit(1)
    bs = BeautifulSoup(r.text, 'lxml')
    #
    # docket_id, previous_docket, petitioners, respondents, date
    # all come from the docketinfo table
    #
    di = bs.find('table', id='docketinfo')
    assert di is not None and isinstance(di, Tag)

    #
    # docket_id is first row, first column
    span = di.find('span')
    assert span is not None and isinstance(span, Tag)
    raw_docket = span.contents[0]
    assert isinstance(raw_docket, str)
    docket_id = raw_docket.strip().replace('No. ', '')

    #
    # Title is second row, first column
    tr = di.contents[1]
    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
    assert tr.contents[0].string == 'Title:'
    td = tr.contents[1]
    assert isinstance(td, Tag)
    span = td.contents[0]
    assert isinstance(span, Tag) and isinstance(span.contents[0], str)
    petitioners = span.contents[0].strip()
    #
    # XXX - We need to deal with other titles.  Change this to an RE
    # UPDATED: we are just handling the two we know about.
    #
    petitioners = petitioners.replace(', Petitioners', '')
    petitioners = petitioners.replace(', Applicants', '')
    assert isinstance(span.contents[4], str)
    respondent = span.contents[4].strip()

    #
    # Date on which the case was docketed
    tr = di.contents[2]
    assert isinstance(tr, Tag) and isinstance(tr.contents[1], Tag)
    td = tr.contents[1]
    assert isinstance(td, Tag) and td.string is not None
    date = dateparser.parse(td.string.strip())

    #
    # linked case is row 3, column 0
    tr = di.contents[3]
    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
    linked = tr.contents[0].string

    #
    # See if this case already exists.
    #
    query = QSqlQuery()
    query.prepare("SELECT * FROM cases WHERE docket_id = :did")
    query.bindValue(':did', docket_id)
    if not query.exec():
        query_error(query)

    #
    # If it does not exist, create it.  This stops a recursion loop.
    #
    if not query.next():
        # Only datetime has timestamp(); dateparser returns datetime or None.
        assert isinstance(date, datetime.datetime)
        query.prepare("INSERT INTO cases (docket_id, petitioners, respondents, date, linked) "
                      "VALUES (:did, :pet, :resp, :date, NULL)")
        query.bindValue(':did', docket_id)
        query.bindValue(':pet', petitioners)
        query.bindValue(':resp', respondent)
        query.bindValue(':date', date.timestamp())
        if not query.exec():
            query_error(query)
        row_id = query.lastInsertId()
        linked_id = None
    else:
        row_id = query.value(0)
        linked_id = query.value('linked')
    assert isinstance(row_id, int)

    #
    # If there is a linked case, we need to get the ID for that case.
    if linked is not None:
        linked = linked.replace('Linked with ', '').strip()
    # A blank cell names no docket; fetching it could only fail.
    if linked:
        query.prepare("SELECT * FROM cases WHERE docket_id = :did")
        query.bindValue(':did', linked)
        if not query.exec():
            query_error(query)
        if not query.next():
            new_id = update_db(linked)
        else:
            new_id = query.value(0)
        if new_id != linked_id:
            query.prepare("UPDATE cases SET linked=:lid WHERE case_id = :cid")
            query.bindValue(':lid', new_id)
            query.bindValue(':cid', row_id)
            if not query.exec():
                query_error(query)
    #
    # XXX - Process lower courts
    #
    update_proceedings(row_id, bs)
    return row_id
|
||||
|
||||
def main() -> int:
|
||||
app = QApplication(sys.argv)
|
||||
@@ -352,8 +211,6 @@ def main() -> int:
|
||||
db.setDatabaseName("scotus.db")
|
||||
db.open()
|
||||
schema_update(db)
|
||||
update_db('24-203')
|
||||
update_db('23A1058')
|
||||
window = MainWindow()
|
||||
return app.exec()
|
||||
|
||||
|
||||
232
workers.py
Normal file
232
workers.py
Normal file
@@ -0,0 +1,232 @@
|
||||
import datetime
|
||||
from re import template
|
||||
|
||||
import dateparser
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from PySide6.QtCore import QThread
|
||||
from PySide6.QtSql import QSqlDatabase, QSqlQuery
|
||||
|
||||
from lib.utils import query_error
|
||||
|
||||
|
||||
def update_proceedings(case_id: int, bs: BeautifulSoup) -> bool:
    """Sync the docket's proceedings table into ``entries`` and ``documents``.

    The first row of the ``proceedings`` table is skipped (presumably the
    header).  The remaining rows are read in pairs: an entry row (date in
    the first cell, text in the second) followed by a row whose second cell
    contains the links to that entry's documents.  Entries and documents
    are inserted through the "update" connection only when no matching row
    is stored yet.

    Args:
        case_id: Value written to ``entries.case_id`` for every entry.
        bs: Parsed docket page.

    Returns:
        ``False`` when the text of the last entry, lower-cased, equals
        "petition denied."; ``True`` otherwise, including when the table
        holds no entries.

    Raises:
        AssertionError: If the page does not have the expected layout or an
            entry date cannot be parsed.
    """
    table = bs.find("table", id="proceedings")
    assert isinstance(table, Tag)
    # Slicing instead of repeated pop(0): drops the first row without
    # shifting the whole list once per row.
    rows = table.find_all("tr")[1:]
    query = QSqlQuery(QSqlDatabase.database("update"))
    # Stays None when there are no entry rows, so the status check below
    # never reads an unbound name.
    text = None
    for index in range(0, len(rows), 2):
        entry_row = rows[index]
        assert isinstance(entry_row, Tag)
        td = entry_row.contents[0]
        assert isinstance(td, Tag) and isinstance(td.string, str)
        date = dateparser.parse(td.string)
        # dateparser returns a datetime (or None); only datetime has
        # timestamp(), so narrow to that type rather than datetime.date.
        assert isinstance(date, datetime.datetime)
        timestamp = date.timestamp()
        td = entry_row.contents[1]
        assert isinstance(td, Tag) and isinstance(td.string, str)
        text = td.string.strip()

        query.prepare(
            "SELECT * FROM entries WHERE case_id = :cid AND date = :date AND text=:text"
        )
        query.bindValue(":cid", case_id)
        query.bindValue(":date", timestamp)
        query.bindValue(":text", text)
        if not query.exec():
            query_error(query)
        if query.next():
            # First column of the stored row is used as the entry id.
            entry_id = query.value(0)
        else:
            query.prepare(
                "INSERT INTO entries (case_id, date, text) VALUES (:cid,:date,:text)"
            )
            query.bindValue(":cid", case_id)
            query.bindValue(":date", timestamp)
            query.bindValue(":text", text)
            if not query.exec():
                query_error(query)
            entry_id = query.lastInsertId()

        if index + 1 >= len(rows):
            # Last entry has no documents row after it: nothing to link.
            break
        docs_row = rows[index + 1]
        assert isinstance(docs_row, Tag)
        links = docs_row.contents[1]
        assert isinstance(links, Tag)
        for a in links:
            assert isinstance(a, Tag)
            url = a.attrs["href"]
            name = a.string
            query.prepare(
                "SELECT * FROM documents WHERE url=:url AND entry_id = :eid"
            )
            query.bindValue(":url", url)
            query.bindValue(":eid", entry_id)
            if not query.exec():
                query_error(query)
            if not query.next():
                query.prepare(
                    "INSERT INTO documents (entry_id, name, url) "
                    "VALUES (:eid, :name, :url)"
                )
                query.bindValue(":eid", entry_id)
                query.bindValue(":name", name)
                query.bindValue(":url", url)
                if not query.exec():
                    query_error(query)

    if text is None:
        # No entries at all: nothing says the case is closed.
        return True
    print(f"text: {text.lower()}")
    # Entry texts that mark the case as no longer active.
    closing_texts = ("petition denied.",)
    return text.lower() not in closing_texts
|
||||
|
||||
|
||||
def update_db(case_id) -> int:
    """Refresh one docket in the database from its supremecourt.gov page.

    The ``cases`` table is checked first (through the "update" connection):
    a stored case whose ``active`` column is not 1 is returned without any
    network access.  Otherwise the docket page is downloaded, its
    ``docketinfo`` table is parsed, a ``cases`` row is created if needed,
    the linked case is resolved (recursively fetched when unknown), the
    proceedings are synced, and the case is marked inactive when
    ``update_proceedings`` reports it closed.

    Args:
        case_id: Docket number, used both for the ``cases.docket_id``
            lookup and for the page URL.

    Returns:
        Id of the ``cases`` row for this docket.

    Raises:
        SystemExit: If the server does not answer with HTTP 200.
        requests.exceptions.RequestException: On connection errors or when
            the server does not respond within the timeout.
        AssertionError: If the page does not have the expected layout.
    """
    #
    # See if this case already exists.
    #
    # We assume that case_id == docket_id at this point.  If it does not,
    # then we will build out from the request we get
    query = QSqlQuery(QSqlDatabase.database("update"))
    query.prepare("SELECT * FROM cases WHERE docket_id = :did")
    query.bindValue(":did", case_id)
    if not query.exec():
        query_error(query)
    exists = query.next()
    if exists:
        if query.value("active") != 1:
            # Cases marked inactive are not fetched again.
            return query.value("case_id")
        # Read the stored values now, while the cursor is still on the row,
        # instead of relying on it staying there through the page parsing.
        row_id = query.value(0)
        linked_id = query.value("linked")
    else:
        row_id = None
        linked_id = None

    r = requests.get(
        f"https://www.supremecourt.gov/docket/docketfiles/html/public/{case_id}.html",
        # Without a timeout requests waits indefinitely on a stalled server.
        timeout=30,
    )
    if r.status_code != 200:
        print(r.status_code)
        # Same exit status as exit(1), without relying on the site module.
        raise SystemExit(1)
    bs = BeautifulSoup(r.text, "lxml")
    #
    # docket_id, previous_docket, petitioners, respondents, date
    # all come from the docketinfo table
    #
    di = bs.find("table", id="docketinfo")
    assert di is not None and isinstance(di, Tag)

    #
    # docket_id is first row, first column
    span = di.find("span")
    assert span is not None and isinstance(span, Tag)
    raw_docket = span.contents[0]
    assert isinstance(raw_docket, str)
    docket_id = raw_docket.strip().replace("No. ", "")

    #
    # Title is second row, first column
    tr = di.contents[1]
    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
    assert tr.contents[0].string == "Title:"
    td = tr.contents[1]
    assert isinstance(td, Tag)
    span = td.contents[0]
    assert isinstance(span, Tag) and isinstance(span.contents[0], str)
    petitioners = span.contents[0].strip()
    #
    # XXX - We need to deal with other titles.  Change this to an RE
    # UPDATED: we are just handling the two we know about.
    #
    petitioners = petitioners.replace(", Petitioners", "")
    petitioners = petitioners.replace(", Applicants", "")
    assert isinstance(span.contents[4], str)
    respondent = span.contents[4].strip()

    #
    # Date on which the case was docketed
    tr = di.contents[2]
    assert isinstance(tr, Tag) and isinstance(tr.contents[1], Tag)
    td = tr.contents[1]
    assert isinstance(td, Tag) and td.string is not None
    date = dateparser.parse(td.string.strip())

    #
    # linked case is row 3, column 0
    tr = di.contents[3]
    assert isinstance(tr, Tag) and isinstance(tr.contents[0], Tag)
    linked = tr.contents[0].string

    #
    # If it does not exist, create it.  This stops a recursion loop.
    #
    if not exists:
        # Only datetime has timestamp(); dateparser returns datetime or None.
        assert isinstance(date, datetime.datetime)
        query.prepare(
            "INSERT INTO cases (docket_id, petitioners, respondents, date, active, linked) "
            "VALUES (:did, :pet, :resp, :date, 1, NULL)"
        )
        query.bindValue(":did", docket_id)
        query.bindValue(":pet", petitioners)
        query.bindValue(":resp", respondent)
        query.bindValue(":date", date.timestamp())
        if not query.exec():
            query_error(query)
        row_id = query.lastInsertId()
    assert isinstance(row_id, int)

    #
    # If there is a linked case, we need to get the ID for that case.
    if linked is not None:
        linked = linked.replace("Linked with ", "").strip()
    # A blank cell names no docket; fetching it could only fail.
    if linked:
        query.prepare("SELECT * FROM cases WHERE docket_id = :did")
        query.bindValue(":did", linked)
        if not query.exec():
            query_error(query)
        if not query.next():
            new_id = update_db(linked)
        else:
            new_id = query.value(0)
        if new_id != linked_id:
            query.prepare("UPDATE cases SET linked=:lid WHERE case_id = :cid")
            query.bindValue(":lid", new_id)
            query.bindValue(":cid", row_id)
            if not query.exec():
                query_error(query)
    #
    # XXX - Process lower courts
    #
    active = update_proceedings(row_id, bs)
    if not active:
        query.prepare("UPDATE cases SET active=0 WHERE case_id = :cid")
        query.bindValue(":cid", row_id)
        if not query.exec():
            query_error(query)
    return row_id
|
||||
|
||||
|
||||
class updateThread(QThread):
    """Thread that refreshes one docket via ``update_db``.

    Set the docket with ``setDocketId()`` before starting the thread.
    ``run()`` makes sure a database connection named "update" exists (a
    clone of the default connection) and then calls ``update_db``.
    """

    # Docket number to refresh; None until setDocketId() is called.
    docket_id = None

    def __init__(self):
        super().__init__()
        print("updateThread: __init__(docket_id)")

    def setDocketId(self, docket_id):
        """Store the docket number that ``run()`` will refresh."""
        self.docket_id = docket_id

    def run(self):
        """Open the "update" connection if needed and refresh the docket.

        Raises:
            RuntimeError: If the cloned database connection cannot be opened.
        """
        print("updateThread: run()")
        if self.docket_id is None:
            # Nothing to refresh; avoids a request for "None.html".
            print("updateThread: run() called without a docket id")
            return

        # NOTE(review): the "update" connection is created once and reused
        # by later runs; confirm this satisfies Qt's rule that a connection
        # is only used from the thread that created it.
        if "update" not in QSqlDatabase.connectionNames():
            db = QSqlDatabase.cloneDatabase(
                "qt_sql_default_connection", "update"
            )
            if not db.open():
                reason = db.lastError().text()
                print(reason)
                raise RuntimeError(f"db.open(): {reason}")

        case_id = update_db(self.docket_id)
        print(f"updateThread: run() returns {case_id}")
        # run() starts no event loop of its own here, so this presumably
        # only records the return code -- TODO confirm it is needed.
        self.exit(1)
|
||||
Reference in New Issue
Block a user