Parse date from emails

master
Nick Krichevsky 2022-10-08 16:37:30 -04:00
parent 0bb543eda1
commit c739436c3a
5 changed files with 136 additions and 32 deletions

32
Cargo.lock generated
View File

@ -1372,9 +1372,9 @@ dependencies = [
[[package]]
name = "regex"
version = "1.5.5"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b"
dependencies = [
"aho-corasick",
"memchr",
@ -1383,9 +1383,9 @@ dependencies = [
[[package]]
name = "regex-syntax"
version = "0.6.25"
version = "0.6.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244"
[[package]]
name = "remove_dir_all"
@ -1677,6 +1677,28 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "test-case"
version = "2.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21d6cf5a7dffb3f9dceec8e6b8ca528d9bd71d36c9f074defb548ce161f598c0"
dependencies = [
"test-case-macros",
]
[[package]]
name = "test-case-macros"
version = "2.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e45b7bf6e19353ddd832745c8fcf77a17a93171df7151187f26623f2b75b5b26"
dependencies = [
"cfg-if",
"proc-macro-error",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "textwrap"
version = "0.15.0"
@ -2073,11 +2095,13 @@ dependencies = [
"itertools",
"log",
"mailparse",
"regex",
"scraper",
"serde",
"serde_yaml",
"simplelog",
"stop-token",
"test-case",
"textwrap",
"thiserror",
"tokio",

View File

@ -20,6 +20,7 @@ scraper = "0.13.0"
itertools = "0.10.4"
fern = "0.6.1"
chrono = "0.4.22"
regex = "1.6"
# For annoying reasons, we must pin exactly the same versions as async-imap if we want to use
# their types.
@ -32,3 +33,4 @@ stop-token = "0.7"
[dev-dependencies]
textwrap = "0.15.0"
futures-timer = "3.0"
test-case = "2.2"

View File

@ -1,5 +1,6 @@
//! Provides utilities to parse emails for transactions.
pub use chrono::naive::NaiveDate;
pub use citi::EmailParser as CitiEmailParser;
pub use td::EmailParser as TDEmailParser;
@ -19,6 +20,7 @@ pub struct Error(String);
pub struct Transaction {
/// Name of the payee (merchant) as extracted from the email.
payee: String,
/// Amount kept as display text rather than a number (the tests in this crate use values like `"$3.28"`).
amount: String,
/// Calendar date associated with the transaction, as parsed from the email.
date: NaiveDate,
}
/// A `TransactionEmailParser` will parse a given email for transaction details
@ -41,6 +43,11 @@ impl Transaction {
/// Returns the transaction amount as text.
pub fn amount(&self) -> &str {
    self.amount.as_str()
}
#[must_use]
pub fn date(&self) -> &NaiveDate {
&self.date
}
}
/// perform a BFS to find a text/html element; this is likely where the actual fun parts of the email are.

View File

@ -3,7 +3,7 @@ use scraper::{Html, Selector};
use crate::Message;
use super::{Transaction, TransactionEmailParser};
use super::{NaiveDate, Transaction, TransactionEmailParser};
/// Parser for Citi transaction emails; implements `TransactionEmailParser`.
pub struct EmailParser;
@ -50,11 +50,13 @@ impl TransactionEmailParser for EmailParser {
.flat_map(|element| element.text());
let amount = find_amount_from_table_text(td_text_iter.clone())?;
let payee = find_payee_from_table_text(td_text_iter)?;
let payee = find_payee_from_table_text(td_text_iter.clone())?;
let date = find_date_from_table_text(td_text_iter)?;
let trans = Transaction {
amount: amount.to_string(),
payee: payee.to_string(),
date,
};
Ok(trans)
@ -64,16 +66,34 @@ impl TransactionEmailParser for EmailParser {
fn find_payee_from_table_text<'a, I>(table_text_iter: I) -> Result<&'a str, super::Error>
where
I: Iterator<Item = &'a str>,
{
find_table_value_with_label(table_text_iter, |label| label == "Merchant")
.ok_or_else(|| super::Error("failed to find merchant in html body".to_string()))
}
/// Looks up the value paired with the `Date` label in the table text and parses it as a date.
///
/// The value is expected in `%m/%d/%Y` form (for example `09/13/2022`).
///
/// # Errors
/// Returns an error when no `Date` label is present in `table_text_iter`, or when the paired
/// value cannot be parsed as a `%m/%d/%Y` date.
fn find_date_from_table_text<'a, I>(table_text_iter: I) -> Result<NaiveDate, super::Error>
where
    I: Iterator<Item = &'a str>,
{
    let date_text = find_table_value_with_label(table_text_iter, |label| label == "Date")
        .ok_or_else(|| super::Error("failed to find date in html body".to_string()))?;

    NaiveDate::parse_from_str(date_text, "%m/%d/%Y")
        .map_err(|err| super::Error(format!("failed to parse date from html body: {:?}", err)))
}
/// Walks `table_text_iter` in consecutive (label, value) pairs and yields the trimmed value of the
/// first pair whose label satisfies `find_func`; yields `None` when no label matches.
// NOTE(review): the body below appears to contain both an earlier hard-coded "Merchant" lookup
// (the `maybe_merchant` binding) and the `find_func`-based lookup — confirm which lines are current.
fn find_table_value_with_label<'a, I, F>(table_text_iter: I, mut find_func: F) -> Option<&'a str>
where
I: Iterator<Item = &'a str>,
F: FnMut(&'a str) -> bool,
{
// The Citi emails have two parallel tables, with a heading on the left side and a value on the right.
// In our list, this ends up as something like [..., "Merchant", "The Store", ...]
// So we iterate in pairs until we find what we want.
let maybe_merchant = table_text_iter
table_text_iter
.tuples()
.find(|&(label, _)| label == "Merchant")
.map(|(_, value)| value.trim());
maybe_merchant.ok_or_else(|| super::Error("failed to find merchant in html body".to_string()))
.find(|&(label, _)| find_func(label))
.map(|(_, value)| value.trim())
}
fn find_amount_from_table_text<'a, I>(mut table_text_iter: I) -> Result<&'a str, super::Error>
@ -103,7 +123,8 @@ mod tests {
assert_eq!(
Transaction {
amount: "$3.28".to_string(),
payee: "STOP & SHOP".to_string()
payee: "STOP & SHOP".to_string(),
date: NaiveDate::from_ymd(2022, 9, 13),
},
transaction
);

View File

@ -1,11 +1,18 @@
use regex::Regex;
use scraper::{Html, Selector};
use crate::Message;
use super::{Transaction, TransactionEmailParser};
use super::{NaiveDate, Transaction, TransactionEmailParser};
/// Email parser that builds a `Transaction` (payee, amount and date) from a message's HTML body.
pub struct EmailParser;
/// `UndatedTransaction` is similar to a `Transaction`, but it does not have a date.
struct UndatedTransaction {
/// Merchant name, with the leading "Merchant Name: " label removed.
payee: String,
/// Amount text with a `$` prefix added and the leading "Amount: " label removed.
amount: String,
}
impl EmailParser {
#[must_use]
pub fn new() -> Self {
@ -33,32 +40,74 @@ impl TransactionEmailParser for EmailParser {
let html_contents = html_part
.get_body()
.map_err(|err| super::Error(format!("failed to find html body: {:?}", err)))?;
// If this is malformed it's programmer error
let ul_selector = Selector::parse("ul").expect("failed to create selector to parse email");
let li_selector =
Selector::parse("ul > li").expect("failed to create selector to parse email");
let html_document = Html::parse_document(&html_contents);
let lis = html_document
.select(&ul_selector)
.flat_map(|ul| ul.select(&li_selector))
.flat_map(|li| li.text())
.collect::<Vec<_>>();
if lis.len() != 2 {
return Err(super::Error(
"email did not follow expected HTML structure; could not find list of details"
.to_string(),
));
}
let html_document = Html::parse_document(&html_contents);
let date = extract_date(&html_document)?;
let undated_info = extract_undated_info(&html_document)?;
let trans = Transaction {
payee: lis[0].trim_start_matches("Merchant Name: ").to_string(),
amount: format!("${}", lis[1].trim_start_matches("Amount: ")),
payee: undated_info.payee,
amount: undated_info.amount,
date,
};
Ok(trans)
}
}
fn extract_date(email_html: &Html) -> Result<NaiveDate, super::Error> {
// If these are malformed it's programmer error
let td_selector = Selector::parse("td").expect("failed to create selector to parse email");
let date_regexp = Regex::new(r"\d{4}-\d{2}-\d{2}").expect("failed to create regex for date");
// Somewhere in the email, there will be a div with the text "A purchase was made on yyyy-mm-dd, which is
// greater than...". We can find this line and parse it out; there isn't enough specificity in the email
// to select it manually.
let date_line = email_html
.select(&td_selector)
.flat_map(|div| div.text())
.find(|item| item.starts_with("A purchase was made on "))
.ok_or_else(|| {
super::Error("failed to find element containing date in email".to_string())
})?;
// We cannot pass any kind of "ignore text after" to `NaiveDate::parse_from_str`, and while we could mess around
// with splitting and such, it will be less brittle to just use a regular expression and then re-parse.
let date_text_match = date_regexp
.find(date_line)
.ok_or_else(|| super::Error("failed to extract date from email text".to_string()))?;
let date_text = &date_line[date_text_match.start()..date_text_match.end()];
NaiveDate::parse_from_str(date_text, "%F")
.map_err(|err| super::Error(format!("failed to parse date from email: {:?}", err)))
}
fn extract_undated_info(email_html: &Html) -> Result<UndatedTransaction, super::Error> {
// If these are mmalformed it's a programmer rror
let ul_selector = Selector::parse("ul").expect("failed to create selector to parse email");
let li_selector = Selector::parse("ul > li").expect("failed to create selector to parse email");
let lis = email_html
.select(&ul_selector)
.flat_map(|ul| ul.select(&li_selector))
.flat_map(|li| li.text())
.collect::<Vec<_>>();
if lis.len() != 2 {
return Err(super::Error(
"email did not follow expected HTML structure; could not find list of details"
.to_string(),
));
}
let undated_trans = UndatedTransaction {
payee: lis[0].trim_start_matches("Merchant Name: ").to_string(),
amount: format!("${}", lis[1].trim_start_matches("Amount: ")),
};
Ok(undated_trans)
}
#[cfg(test)]
mod tests {
use super::*;
@ -74,7 +123,8 @@ mod tests {
assert_eq!(
Transaction {
amount: "$0.60".to_string(),
payee: "PAYPAL *MARKETPLACE".to_string()
payee: "PAYPAL *MARKETPLACE".to_string(),
date: NaiveDate::from_ymd(2022, 9, 17)
},
transaction
);