Parse date from emails
parent
0bb543eda1
commit
c739436c3a
|
@ -1372,9 +1372,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.5.5"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
|
||||
checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
|
@ -1383,9 +1383,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.25"
|
||||
version = "0.6.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
|
||||
checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244"
|
||||
|
||||
[[package]]
|
||||
name = "remove_dir_all"
|
||||
|
@ -1677,6 +1677,28 @@ dependencies = [
|
|||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "test-case"
|
||||
version = "2.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21d6cf5a7dffb3f9dceec8e6b8ca528d9bd71d36c9f074defb548ce161f598c0"
|
||||
dependencies = [
|
||||
"test-case-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "test-case-macros"
|
||||
version = "2.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e45b7bf6e19353ddd832745c8fcf77a17a93171df7151187f26623f2b75b5b26"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textwrap"
|
||||
version = "0.15.0"
|
||||
|
@ -2073,11 +2095,13 @@ dependencies = [
|
|||
"itertools",
|
||||
"log",
|
||||
"mailparse",
|
||||
"regex",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_yaml",
|
||||
"simplelog",
|
||||
"stop-token",
|
||||
"test-case",
|
||||
"textwrap",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
|
|
|
@ -20,6 +20,7 @@ scraper = "0.13.0"
|
|||
itertools = "0.10.4"
|
||||
fern = "0.6.1"
|
||||
chrono = "0.4.22"
|
||||
regex = "1.6"
|
||||
|
||||
# For annoying reasons, we must pin exactly the same versions as async-imap if we want to use
|
||||
# their types.
|
||||
|
@ -32,3 +33,4 @@ stop-token = "0.7"
|
|||
[dev-dependencies]
|
||||
textwrap = "0.15.0"
|
||||
futures-timer = "3.0"
|
||||
test-case = "2.2"
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
//! Provides utilities to parse emails for transactions.
|
||||
|
||||
pub use chrono::naive::NaiveDate;
|
||||
pub use citi::EmailParser as CitiEmailParser;
|
||||
pub use td::EmailParser as TDEmailParser;
|
||||
|
||||
|
@ -19,6 +20,7 @@ pub struct Error(String);
|
|||
pub struct Transaction {
|
||||
payee: String,
|
||||
amount: String,
|
||||
date: NaiveDate,
|
||||
}
|
||||
|
||||
/// A `TransactionEmailParser` will parse a given email for transaction details
|
||||
|
@ -41,6 +43,11 @@ impl Transaction {
|
|||
pub fn amount(&self) -> &str {
|
||||
&self.amount
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn date(&self) -> &NaiveDate {
|
||||
&self.date
|
||||
}
|
||||
}
|
||||
|
||||
/// perform a BFS to find a text/html element; this is likely where the actual fun parts of the email are.
|
||||
|
|
|
@ -3,7 +3,7 @@ use scraper::{Html, Selector};
|
|||
|
||||
use crate::Message;
|
||||
|
||||
use super::{Transaction, TransactionEmailParser};
|
||||
use super::{NaiveDate, Transaction, TransactionEmailParser};
|
||||
|
||||
pub struct EmailParser;
|
||||
|
||||
|
@ -50,11 +50,13 @@ impl TransactionEmailParser for EmailParser {
|
|||
.flat_map(|element| element.text());
|
||||
|
||||
let amount = find_amount_from_table_text(td_text_iter.clone())?;
|
||||
let payee = find_payee_from_table_text(td_text_iter)?;
|
||||
let payee = find_payee_from_table_text(td_text_iter.clone())?;
|
||||
let date = find_date_from_table_text(td_text_iter)?;
|
||||
|
||||
let trans = Transaction {
|
||||
amount: amount.to_string(),
|
||||
payee: payee.to_string(),
|
||||
date,
|
||||
};
|
||||
|
||||
Ok(trans)
|
||||
|
@ -64,16 +66,34 @@ impl TransactionEmailParser for EmailParser {
|
|||
fn find_payee_from_table_text<'a, I>(table_text_iter: I) -> Result<&'a str, super::Error>
|
||||
where
|
||||
I: Iterator<Item = &'a str>,
|
||||
{
|
||||
find_table_value_with_label(table_text_iter, |label| label == "Merchant")
|
||||
.ok_or_else(|| super::Error("failed to find merchant in html body".to_string()))
|
||||
}
|
||||
|
||||
fn find_date_from_table_text<'a, I>(table_text_iter: I) -> Result<NaiveDate, super::Error>
|
||||
where
|
||||
I: Iterator<Item = &'a str>,
|
||||
{
|
||||
let date_text = find_table_value_with_label(table_text_iter, |label| label == "Date")
|
||||
.ok_or_else(|| super::Error("failed to find date in html body".to_string()))?;
|
||||
|
||||
NaiveDate::parse_from_str(dbg!(date_text), "%m/%d/%Y")
|
||||
.map_err(|err| super::Error(format!("failed to parse date from html body: {:?}", err)))
|
||||
}
|
||||
|
||||
fn find_table_value_with_label<'a, I, F>(table_text_iter: I, mut find_func: F) -> Option<&'a str>
|
||||
where
|
||||
I: Iterator<Item = &'a str>,
|
||||
F: FnMut(&'a str) -> bool,
|
||||
{
|
||||
// The Citi emails have two parallel tables, with a heading on the left side and a value on the right.
|
||||
// In our list, this ends up as something like [..., "Merchant", "The Store", ...]
|
||||
// So we iterate in pairs until we find what we want.
|
||||
let maybe_merchant = table_text_iter
|
||||
table_text_iter
|
||||
.tuples()
|
||||
.find(|&(label, _)| label == "Merchant")
|
||||
.map(|(_, value)| value.trim());
|
||||
|
||||
maybe_merchant.ok_or_else(|| super::Error("failed to find merchant in html body".to_string()))
|
||||
.find(|&(label, _)| find_func(label))
|
||||
.map(|(_, value)| value.trim())
|
||||
}
|
||||
|
||||
fn find_amount_from_table_text<'a, I>(mut table_text_iter: I) -> Result<&'a str, super::Error>
|
||||
|
@ -103,7 +123,8 @@ mod tests {
|
|||
assert_eq!(
|
||||
Transaction {
|
||||
amount: "$3.28".to_string(),
|
||||
payee: "STOP & SHOP".to_string()
|
||||
payee: "STOP & SHOP".to_string(),
|
||||
date: NaiveDate::from_ymd(2022, 9, 13),
|
||||
},
|
||||
transaction
|
||||
);
|
||||
|
|
|
@ -1,11 +1,18 @@
|
|||
use regex::Regex;
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
use crate::Message;
|
||||
|
||||
use super::{Transaction, TransactionEmailParser};
|
||||
use super::{NaiveDate, Transaction, TransactionEmailParser};
|
||||
|
||||
pub struct EmailParser;
|
||||
|
||||
// UndatedTransaction is similar to a transaction, but it does not have a date.
|
||||
struct UndatedTransaction {
|
||||
payee: String,
|
||||
amount: String,
|
||||
}
|
||||
|
||||
impl EmailParser {
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
|
@ -33,32 +40,74 @@ impl TransactionEmailParser for EmailParser {
|
|||
let html_contents = html_part
|
||||
.get_body()
|
||||
.map_err(|err| super::Error(format!("failed to find html body: {:?}", err)))?;
|
||||
// If this is malformed it's programmer error
|
||||
let ul_selector = Selector::parse("ul").expect("failed to create selector to parse email");
|
||||
let li_selector =
|
||||
Selector::parse("ul > li").expect("failed to create selector to parse email");
|
||||
let html_document = Html::parse_document(&html_contents);
|
||||
let lis = html_document
|
||||
.select(&ul_selector)
|
||||
.flat_map(|ul| ul.select(&li_selector))
|
||||
.flat_map(|li| li.text())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if lis.len() != 2 {
|
||||
return Err(super::Error(
|
||||
"email did not follow expected HTML structure; could not find list of details"
|
||||
.to_string(),
|
||||
));
|
||||
}
|
||||
let html_document = Html::parse_document(&html_contents);
|
||||
let date = extract_date(&html_document)?;
|
||||
let undated_info = extract_undated_info(&html_document)?;
|
||||
|
||||
let trans = Transaction {
|
||||
payee: lis[0].trim_start_matches("Merchant Name: ").to_string(),
|
||||
amount: format!("${}", lis[1].trim_start_matches("Amount: ")),
|
||||
payee: undated_info.payee,
|
||||
amount: undated_info.amount,
|
||||
date,
|
||||
};
|
||||
|
||||
Ok(trans)
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_date(email_html: &Html) -> Result<NaiveDate, super::Error> {
|
||||
// If these are malformed it's programmer error
|
||||
let td_selector = Selector::parse("td").expect("failed to create selector to parse email");
|
||||
let date_regexp = Regex::new(r"\d{4}-\d{2}-\d{2}").expect("failed to create regex for date");
|
||||
|
||||
// Somewhere in the email, there will be a div with the text "A purchase was made on yyyy-mm-dd, which is
|
||||
// greater than...". We can find this line and parse it out; there isn't enough specificity in the email
|
||||
// to select it manually.
|
||||
let date_line = email_html
|
||||
.select(&td_selector)
|
||||
.flat_map(|div| div.text())
|
||||
.find(|item| item.starts_with("A purchase was made on "))
|
||||
.ok_or_else(|| {
|
||||
super::Error("failed to find element containing date in email".to_string())
|
||||
})?;
|
||||
|
||||
// We cannot pass any kind of "ignore text after" to `NaiveDate::parse_from_str`, and while we could mess around
|
||||
// with splitting and such, it will be less brittle to just use a regular expression and then re-parse.
|
||||
let date_text_match = date_regexp
|
||||
.find(date_line)
|
||||
.ok_or_else(|| super::Error("failed to extract date from email text".to_string()))?;
|
||||
let date_text = &date_line[date_text_match.start()..date_text_match.end()];
|
||||
|
||||
NaiveDate::parse_from_str(date_text, "%F")
|
||||
.map_err(|err| super::Error(format!("failed to parse date from email: {:?}", err)))
|
||||
}
|
||||
|
||||
fn extract_undated_info(email_html: &Html) -> Result<UndatedTransaction, super::Error> {
|
||||
// If these are mmalformed it's a programmer rror
|
||||
let ul_selector = Selector::parse("ul").expect("failed to create selector to parse email");
|
||||
let li_selector = Selector::parse("ul > li").expect("failed to create selector to parse email");
|
||||
|
||||
let lis = email_html
|
||||
.select(&ul_selector)
|
||||
.flat_map(|ul| ul.select(&li_selector))
|
||||
.flat_map(|li| li.text())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if lis.len() != 2 {
|
||||
return Err(super::Error(
|
||||
"email did not follow expected HTML structure; could not find list of details"
|
||||
.to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let undated_trans = UndatedTransaction {
|
||||
payee: lis[0].trim_start_matches("Merchant Name: ").to_string(),
|
||||
amount: format!("${}", lis[1].trim_start_matches("Amount: ")),
|
||||
};
|
||||
|
||||
Ok(undated_trans)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
@ -74,7 +123,8 @@ mod tests {
|
|||
assert_eq!(
|
||||
Transaction {
|
||||
amount: "$0.60".to_string(),
|
||||
payee: "PAYPAL *MARKETPLACE".to_string()
|
||||
payee: "PAYPAL *MARKETPLACE".to_string(),
|
||||
date: NaiveDate::from_ymd(2022, 9, 17)
|
||||
},
|
||||
transaction
|
||||
);
|
||||
|
|
Loading…
Reference in New Issue