monzo-ingestion/src/ingestion/db.rs
Joshua Coles d11e4fd0c4
All checks were successful
Build and Publish / Build and Test (push) Successful in 9m12s
Allow specifying the account ID and fix bug when delivering batched rows
2025-12-28 12:04:53 +00:00

337 lines
11 KiB
Rust

use crate::error::AppError;
use crate::ingestion::ingestion_logic::MonzoRow;
use anyhow::anyhow;
use entity::{expenditure, transaction};
use sea_orm::sea_query::OnConflict;
use sea_orm::{
ColumnTrait, DatabaseConnection, DbErr, EntityTrait, Iterable, QuerySelect, TransactionTrait,
};
use sea_orm::{ConnectionTrait, DatabaseTransaction, QueryFilter};
/// One ingested row, prepared for writing to the database.
///
/// Bundles the transaction row with the expenditure rows that belong to it and the hash
/// used to detect whether an identical row has already been stored.
pub struct Insertion {
/// The transaction row; upserted by `update_transactions`, keyed on its id.
pub transaction: transaction::ActiveModel,
/// The expenditure rows attached to this transaction; written by `update_expenditures`.
pub contained_expenditures: Vec<expenditure::ActiveModel>,
/// Compared against the stored `IdentityHash` column by `whittle_insertions`; an
/// insertion whose hash is already present is treated as unchanged and skipped.
pub identity_hash: i64,
}
// Note while this is more efficient in db calls, it does bind together the entire group.
// We employ a batching process for now to try balance speed and failure rate, but it is worth
// trying to move failures earlier and improve reporting.
pub async fn insert(
db: &DatabaseConnection,
monzo_rows: Vec<MonzoRow>,
account_id: i32,
) -> Result<Vec<String>, AppError> {
let mut new_transaction_ids = Vec::new();
let insertions = monzo_rows
.into_iter()
.map(|row| MonzoRow::into_insertion(row, account_id))
.collect::<Result<Vec<_>, _>>()?;
for insertions in insertions.chunks(200) {
let (new_or_updated_insertions, inserted_transaction_ids) =
whittle_insertions(insertions, db).await?;
if new_or_updated_insertions.is_empty() {
continue;
}
let tx = db.begin().await?;
update_transactions(&tx, &new_or_updated_insertions).await?;
update_expenditures(&tx, &new_or_updated_insertions, &inserted_transaction_ids).await?;
tx.commit().await?;
// We wait until the transaction is committed before adding the new transaction ids to the
// list to avoid issues with the transaction being rolled back.
new_transaction_ids.extend(inserted_transaction_ids);
}
// Notify the new transactions once everything is committed.
notify_new_transactions(db, &new_transaction_ids).await?;
Ok(new_transaction_ids)
}
/// Replaces the stored expenditures of the given transactions inside `tx`.
///
/// * `new_or_updated_insertions` - the insertions whose expenditures should be written.
/// * `inserted_transaction_ids` - the transaction ids whose existing expenditures are
///   deleted first.
///
/// Does nothing when `new_or_updated_insertions` is empty.
///
/// # Errors
///
/// Returns an error if the delete or the insert statement fails.
async fn update_expenditures(
    tx: &DatabaseTransaction,
    new_or_updated_insertions: &[&Insertion],
    inserted_transaction_ids: &[String],
) -> Result<(), AppError> {
    if new_or_updated_insertions.is_empty() {
        return Ok(());
    }

    // Expenditures can change as we re-categorise them, so we delete all the old ones and
    // insert an entirely new set to ensure we don't end up leaving old ones around.
    expenditure::Entity::delete_many()
        .filter(expenditure::Column::TransactionId.is_in(inserted_transaction_ids))
        .exec(tx)
        .await?;

    let expenditures = new_or_updated_insertions
        .iter()
        .flat_map(|i| &i.contained_expenditures)
        .cloned()
        .collect::<Vec<_>>();

    // None of the transactions in this batch may carry expenditures. An insert with no
    // rows is not a valid statement, so stop here; the stale rows are already deleted.
    if expenditures.is_empty() {
        return Ok(());
    }

    expenditure::Entity::insert_many(expenditures)
        .on_conflict(
            OnConflict::columns(vec![
                expenditure::Column::TransactionId,
                expenditure::Column::Category,
            ])
            .update_columns(expenditure::Column::iter())
            .to_owned(),
        )
        .exec(tx)
        .await?;
    Ok(())
}
/// Upserts the transaction row of every given insertion inside `tx`.
///
/// Rows are matched on the transaction id; when a row with the same id already exists,
/// all of its columns are overwritten. An empty slice is a no-op.
async fn update_transactions(
    tx: &DatabaseTransaction,
    new_or_updated_insertions: &[&Insertion],
) -> Result<(), DbErr> {
    if new_or_updated_insertions.is_empty() {
        return Ok(());
    }

    // On an id clash, replace every column of the stored row.
    let overwrite_on_id_clash = OnConflict::column(transaction::Column::Id)
        .update_columns(transaction::Column::iter())
        .to_owned();

    let rows = new_or_updated_insertions
        .iter()
        .map(|insertion| insertion.transaction.clone());

    transaction::Entity::insert_many(rows)
        .on_conflict(overwrite_on_id_clash)
        .exec(tx)
        .await
        .map(|_| ())
}
/// Filters `insertions` down to those whose identity hash is not yet stored.
///
/// Returns the surviving insertions together with their transaction ids, in the same
/// order.
///
/// # Errors
///
/// Returns an error if the query for existing hashes fails.
///
/// # Panics
///
/// Panics if a surviving insertion's transaction id has not been set.
async fn whittle_insertions<'a>(
    insertions: &'a [Insertion],
    tx: &DatabaseConnection,
) -> Result<(Vec<&'a Insertion>, Vec<String>), AppError> {
    let existing_hashes = transaction::Entity::find()
        .select_only()
        .columns([transaction::Column::IdentityHash])
        .filter(transaction::Column::IdentityHash.is_not_null())
        .into_tuple::<(i64,)>()
        .all(tx)
        .await?;
    tracing::debug!("Found existing entries: {existing_hashes:?}");

    // Index the stored hashes once so each candidate is a constant-time lookup rather
    // than a scan over every stored hash.
    let existing_hashes = existing_hashes
        .into_iter()
        .map(|(hash,)| hash)
        .collect::<std::collections::HashSet<i64>>();

    // We will only update those where the hash is different to avoid unnecessary updates and
    // notifications.
    let new_or_updated_insertions = insertions
        .iter()
        .filter(|i| !existing_hashes.contains(&i.identity_hash))
        .collect::<Vec<_>>();

    let inserted_transaction_ids = new_or_updated_insertions
        .iter()
        .map(|i| i.transaction.id.clone().unwrap())
        .collect::<Vec<_>>();

    Ok((new_or_updated_insertions, inserted_transaction_ids))
}
/// Publishes `new_transaction_ids` as a JSON array on the `monzo_new_transactions`
/// channel via `NOTIFY`.
///
/// A notification is sent even when the slice is empty (the payload is then `[]`).
///
/// # Errors
///
/// Returns an error if the ids cannot be serialised or the statement fails.
///
/// NOTE(review): PostgreSQL limits the NOTIFY payload size (under 8000 bytes in the
/// default configuration), so a very large set of ids would make this statement fail —
/// consider splitting the ids across several notifications if that can happen.
async fn notify_new_transactions(
    db: &DatabaseConnection,
    new_transaction_ids: &[String],
) -> Result<(), AppError> {
    let payload = serde_json::to_string(new_transaction_ids).map_err(|e| anyhow!(e))?;
    // The payload is embedded in a single-quoted SQL literal, so any single quote inside
    // an id must be doubled; otherwise it would end the literal early and corrupt (or
    // inject into) the statement. JSON itself never escapes `'`.
    let payload = payload.replace('\'', "''");
    db.execute_unprepared(&format!(r#"NOTIFY monzo_new_transactions, '{payload}'"#))
        .await?;
    Ok(())
}
// Integration tests for the ingestion database layer. They run against a PostgreSQL
// container that is started once and shared by every test in this module.
//
// Gated on `cfg(test)` so the container/sqlx test harness is not compiled into
// non-test builds.
//
// NOTE(review): `test_notify` and `test_notify_on_insert` both listen on the
// `monzo_new_transactions` channel of the same shared database, so when the test harness
// runs them in parallel one may receive the other's notification — consider serialising
// them.
#[cfg(test)]
mod tests {
    use super::{insert, notify_new_transactions, update_expenditures, update_transactions};
    use crate::ingestion::ingestion_logic::from_json_row;
    use anyhow::Error;
    use entity::account;
    use migration::MigratorTrait;
    use sea_orm::{ActiveModelTrait, DatabaseConnection, TransactionTrait};
    use serde_json::Value;
    use sqlx::postgres::PgListener;
    use sqlx::{Executor, PgPool};
    use testcontainers::runners::AsyncRunner;
    use testcontainers::ContainerAsync;
    use tokio::sync::OnceCell;

    /// The shared test database: the container plus the two handles connected to it.
    #[derive(Debug)]
    struct DatabaseInstance {
        // Not read by the tests; presumably held so the container stays alive for as
        // long as the connections are in use.
        container: ContainerAsync<testcontainers_modules::postgres::Postgres>,
        // SeaORM connection used by the code under test.
        db: DatabaseConnection,
        // sqlx pool used to open `PgListener`s.
        pool: PgPool,
    }

    static INSTANCE: OnceCell<DatabaseInstance> = OnceCell::const_new();

    /// Starts a PostgreSQL container, runs the migrations, and connects both handles.
    async fn initialise_db() -> Result<DatabaseInstance, Error> {
        let container = testcontainers_modules::postgres::Postgres::default()
            .start()
            .await?;

        // prepare connection string
        let connection_string = &format!(
            "postgres://postgres:postgres@127.0.0.1:{}/postgres",
            container.get_host_port_ipv4(5432).await?
        );

        let db: DatabaseConnection = sea_orm::Database::connect(connection_string).await?;
        migration::Migrator::up(&db, None).await?;

        let pool = PgPool::connect(connection_string).await?;

        let instance = DatabaseInstance {
            container,
            db,
            pool,
        };

        Ok(instance)
    }

    /// Returns the shared database instance, initialising it on first use.
    ///
    /// Panics if initialisation fails.
    async fn get_or_initialize_db_instance() -> Result<&'static DatabaseInstance, Error> {
        Ok(INSTANCE
            .get_or_init(|| async { initialise_db().await.unwrap() })
            .await)
    }

    /// Inserts a fresh account named "Test Account" and returns its generated id.
    async fn create_test_account(db: &DatabaseConnection) -> Result<i32, Error> {
        let new_account = account::ActiveModel {
            id: sea_orm::ActiveValue::NotSet,
            name: sea_orm::ActiveValue::Set("Test Account".to_string()),
        };

        let inserted = new_account.insert(db).await?;
        Ok(inserted.id)
    }

    /// Both update helpers must accept an empty batch without error.
    #[tokio::test]
    async fn test_empty_insertion_list() -> Result<(), Error> {
        let db = get_or_initialize_db_instance().await?;
        let insertions = vec![];

        let tx = db.db.begin().await?;
        update_transactions(&tx, &insertions).await?;
        update_expenditures(&tx, &insertions, &[]).await?;
        tx.commit().await?;

        Ok(())
    }

    /// A notification carries exactly the ids passed in, as a JSON array.
    #[tokio::test]
    async fn test_notify() -> Result<(), Error> {
        let dbi = get_or_initialize_db_instance().await?;
        let mut listener = PgListener::connect_with(&dbi.pool).await?;
        listener.listen("monzo_new_transactions").await?;

        let ids = vec![
            "test1".to_string(),
            "test2".to_string(),
            "test3".to_string(),
        ];

        notify_new_transactions(&dbi.db, &ids).await?;

        let notification = listener.recv().await?;
        let payload = notification.payload();
        println!("Payload: {}", payload);
        assert_eq!(
            serde_json::from_str::<Vec<String>>(payload)?,
            ids,
            "Payloads do not match"
        );

        Ok(())
    }

    /// Inserting notifies new ids, re-inserting identical rows notifies an empty list,
    /// and re-inserting an altered row notifies only that row.
    #[tokio::test]
    async fn test_notify_on_insert() -> Result<(), Error> {
        let dbi = get_or_initialize_db_instance().await?;
        let account_id = create_test_account(&dbi.db).await?;
        let mut listener = PgListener::connect_with(&dbi.pool).await?;
        listener.listen("monzo_new_transactions").await?;

        let json = include_str!("../../fixtures/transactions.json");
        let json: Vec<Vec<Value>> = serde_json::from_str(json).unwrap();
        let data = json
            .iter()
            .map(|row| from_json_row(row))
            .collect::<Result<Vec<_>, anyhow::Error>>()?;

        // First insert: every fixture row is new.
        insert(&dbi.db, data.clone(), account_id).await?;
        let notification = listener.recv().await?;
        let payload = notification.payload();
        let mut payload = serde_json::from_str::<Vec<String>>(payload)?;
        payload.sort();

        let mut ids = data
            .iter()
            .map(|row| row.transaction_id.clone())
            .collect::<Vec<_>>();
        ids.sort();

        assert_eq!(payload, ids, "Inserted IDs do not match");

        // Second insert of the same rows: nothing changed, so the list is empty.
        insert(&dbi.db, data.clone(), account_id).await?;
        let notification = listener.recv().await?;
        let payload = notification.payload();
        let payload = serde_json::from_str::<Vec<String>>(payload)?;
        assert_eq!(
            payload,
            Vec::<String>::new(),
            "Re-inserting identical rows triggered double notification"
        );

        // Alter one row so its hash changes; only that row should be notified.
        let mut altered_data = data.clone();
        altered_data[0].description = Some("New description".to_string());

        assert_ne!(
            altered_data[0].compute_hash(),
            data[0].compute_hash(),
            "Alterations have the same hash"
        );

        insert(&dbi.db, altered_data.clone(), account_id).await?;
        let notification = listener.recv().await?;
        let payload = notification.payload();
        let payload = serde_json::from_str::<Vec<String>>(payload)?;
        assert_eq!(
            payload,
            vec![altered_data[0].transaction_id.clone()],
            "Re-inserting altered row failed to re-trigger notification"
        );

        Ok(())
    }
}
/// Looks up the id of the account with the given name.
///
/// * `p0` - the database connection to query.
/// * `p1` - the account name; `None` falls back to `"Monzo"`.
///
/// # Errors
///
/// Returns `AppError::BadRequest` if no account has that name, or the query error if
/// the lookup itself fails.
pub(crate) async fn get_account_id(
    p0: &DatabaseConnection,
    p1: Option<String>,
) -> Result<i32, AppError> {
    // Build the default name only when none was supplied.
    let account_name = p1.unwrap_or_else(|| "Monzo".to_string());

    entity::prelude::Account::find()
        .filter(entity::account::Column::Name.eq(account_name))
        .select_only()
        .column(entity::account::Column::Id)
        .into_tuple::<i32>()
        .one(p0)
        .await?
        // Construct the error lazily so the success path does not pay for it.
        .ok_or_else(|| AppError::BadRequest(anyhow!("Account not found")))
}