Add handling of binary files

hl will now refuse to scan files that look like binary files unless the `-b` flag is passed
master
Nick Krichevsky 2021-11-14 22:29:20 -05:00
parent 076d431f54
commit f080e6604c
5 changed files with 108 additions and 22 deletions

7
Cargo.lock generated
View File

@ -217,7 +217,6 @@ version = "0.1.1"
dependencies = [
"clap",
"grep",
"stringreader",
"termion",
"test-case",
"thiserror",
@ -382,12 +381,6 @@ dependencies = [
"serde",
]
[[package]]
name = "stringreader"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "913e7b03d63752f6cdd2df77da36749d82669904798fe8944b9ec3d23f159905"
[[package]]
name = "strsim"
version = "0.8.0"

View File

@ -16,4 +16,3 @@ clap = "2.33"
[dev-dependencies]
test-case = "1.2.1"
stringreader = "0.1"

41
src/file.rs Normal file
View File

@ -0,0 +1,41 @@
use std::io::{Error, Read};
/// Maximum number of invalid UTF-8 byte sequences tolerated in the inspected prefix.
const NOT_UTF_8_THRESHOLD: usize = 5;
/// Maximum number of bytes inspected from the start of the input.
const BUFFER_CHECK_AMOUNT: usize = 255;

/// `is_file_likely_utf8` checks if a file is likely filled entirely with UTF-8 characters.
/// This is useful to check if a file is likely human-readable or not.
///
/// Up to `BUFFER_CHECK_AMOUNT` bytes are consumed from the start of `file`. The input is considered
/// UTF-8 as long as that prefix holds no more than `NOT_UTF_8_THRESHOLD` invalid byte sequences.
/// The reader is *not* rewound afterwards; callers that want to re-read the data must seek back themselves.
///
/// # Errors
///
/// An [`Error`] will be returned if there is an underlying problem reading from the given [`Read`]
//
// This mechanism is inspired heavily by `less`' implementation, which follows the same semantics (in utf-8 mode,
// at least).
// https://github.com/gwsw/less/blob/294976950f5dc2a6b3436b1d2df97034936552b9/filename.c#L480-L484
pub fn is_file_likely_utf8<R: Read>(file: &mut R) -> Result<bool, Error> {
    let mut buf = [0_u8; BUFFER_CHECK_AMOUNT];

    // A single `read` call may legally return fewer bytes than are available, so keep reading until the
    // buffer is full or the input is exhausted. Interrupted reads are retried rather than reported.
    let mut filled = 0;
    while filled < buf.len() {
        match file.read(&mut buf[filled..]) {
            Ok(0) => break,
            Ok(read) => filled += read,
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => {}
            Err(err) => return Err(err),
        }
    }
    let sample_was_cut_short = filled == buf.len();

    // Count the invalid sequences directly instead of counting U+FFFD in a lossily decoded string:
    // the latter also counts replacement characters that are genuinely present in valid input.
    let mut invalid_sequences = 0_usize;
    let mut remaining = &buf[..filled];
    while let Err(err) = std::str::from_utf8(remaining) {
        match err.error_len() {
            Some(invalid_len) => {
                invalid_sequences += 1;
                remaining = &remaining[err.valid_up_to() + invalid_len..];
            }
            None => {
                // The sample ends in the middle of a multi-byte character. That only counts against the
                // input if the input itself ended there, not if our fixed-size buffer sliced it.
                if !sample_was_cut_short {
                    invalid_sequences += 1;
                }
                break;
            }
        }
    }

    Ok(invalid_sequences <= NOT_UTF_8_THRESHOLD)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;
    use test_case::test_case;

    // Each case wraps a raw byte sample in an in-memory reader and compares the verdict returned by
    // `is_file_likely_utf8` against the expected one.
    #[test_case(b"hello", true; "simple string is utf-8")]
    #[test_case(b"hello\xff\xffworld", true; "single non-utf-8 is ok")]
    #[test_case(b"hello\xff\xffworld\xfa\xfb\xfc\xfd\xfe", false; "too many non-utf-8 is not ok")]
    fn test_is_file_likely_utf8(s: &[u8], is_utf8: bool) {
        let verdict = is_file_likely_utf8(&mut Cursor::new(s)).unwrap();

        assert_eq!(is_utf8, verdict);
    }
}

View File

@ -7,6 +7,7 @@ use std::io;
use std::io::Read;
use thiserror::Error;
pub mod file;
mod lines;
pub mod print;
mod sink;
@ -97,7 +98,7 @@ mod tests {
use super::*;
use crate::testutil;
use std::io;
use stringreader::StringReader;
use std::io::Cursor;
use test_case::test_case;
use testutil::mock_print::MockPrinter;
@ -118,7 +119,7 @@ mod tests {
#[test]
fn test_highlights_matches() {
let mock_printer = MockPrinter::default();
let mut lipsum_reader = StringReader::new(SEARCH_TEXT);
let mut lipsum_reader = Cursor::new(SEARCH_TEXT);
let res = scan_pattern_to_printer(
&mut lipsum_reader,
r#""?computable"?\snumbers"#,
@ -157,7 +158,7 @@ mod tests {
#[test]
fn case_insensitive_pattern_matches() {
let mock_printer = MockPrinter::default();
let mut lipsum_reader = StringReader::new(SEARCH_TEXT);
let mut lipsum_reader = Cursor::new(SEARCH_TEXT);
// This test is a little bit of a cheat, because it doesn't test what's actually inputted by the CLI,
// but it does make sure the functionality works as expected
let res = scan_pattern_to_printer(&mut lipsum_reader, "(?i)INTEGRAL", &mock_printer);
@ -202,7 +203,7 @@ mod tests {
let broken_pipe_err =
print::Error::from(io::Error::new(io::ErrorKind::BrokenPipe, "broken pipe"));
mock_printer.fail_next(broken_pipe_err);
let mut lipsum_reader = StringReader::new(SEARCH_TEXT);
let mut lipsum_reader = Cursor::new(SEARCH_TEXT);
let res = scan_pattern_to_printer(&mut lipsum_reader, pattern, &mock_printer);
assert!(!res.is_err(), "failed to search: {:?}", res.unwrap_err());

View File

@ -1,15 +1,18 @@
#![warn(clippy::all, clippy::pedantic)]
use clap::{crate_name, crate_version, App, AppSettings, Arg, ArgMatches};
use hline::file;
use std::env;
use std::fmt::Display;
use std::fs::File;
use std::io;
use std::io::{Read, Stdin};
use std::io::{Read, Seek, Stdin};
use std::process;
use termion::color::{Fg, LightRed, Reset};
const FILENAME_ARG_NAME: &str = "filename";
const PATTERN_ARG_NAME: &str = "pattern";
const CASE_INSENSITIVE_ARG_NAME: &str = "case-insensitive";
const OK_IF_BINARY_ARG_NAME: &str = "ok-if-binary";
/// `OpenedFile` represents some kind of file that was opened for further handling by `hl`
enum OpenedFile {
@ -27,6 +30,7 @@ enum PassedFile {
/// Parsed command-line arguments for the program.
struct Args {
/// Pattern derived from the `pattern` command-line argument; handed to `hline::scan_pattern`.
pattern: String,
/// The input that was passed on the command line; handed to `open_file`.
file: PassedFile,
/// Set when the `ok-if-binary` flag is present; when true, the binary-file check before scanning is skipped.
ok_if_binary_file: bool,
}
impl Read for OpenedFile {
@ -42,6 +46,7 @@ impl Read for OpenedFile {
impl From<ArgMatches<'_>> for Args {
fn from(args: ArgMatches) -> Self {
let case_insensitive = args.is_present(CASE_INSENSITIVE_ARG_NAME);
let ok_if_binary_file = args.is_present(OK_IF_BINARY_ARG_NAME);
let pattern = args
.value_of(PATTERN_ARG_NAME)
.map(|pat| {
@ -59,7 +64,11 @@ impl From<ArgMatches<'_>> for Args {
PassedFile::Path(filename.to_string())
});
Args { pattern, file }
Args {
pattern,
file,
ok_if_binary_file,
}
}
}
@ -70,24 +79,32 @@ fn main() {
let args = args_parse_result.unwrap();
let open_file_result = open_file(args.file);
if let Err(err) = open_file_result {
eprintln!("Failed to open input file: {}", err);
print_error(format!("Failed to open input file: {}", err));
process::exit(2);
}
let opened_file = open_file_result.unwrap();
let mut opened_file = open_file_result.unwrap();
if !args.ok_if_binary_file {
handle_potentially_binary_file(&mut opened_file);
}
let scan_result = hline::scan_pattern(opened_file, &args.pattern);
if let Err(err) = scan_result {
// the lib crate provides the context for the errors in their error messages
eprintln!(
"{color}error:{reset} {err}",
color = Fg(LightRed),
reset = Fg(Reset),
err = err
);
print_error(err);
process::exit(3);
}
}
/// Write `error_msg` to stderr, prefixed with an `error:` label rendered in light red.
fn print_error<T>(error_msg: T)
where
    T: Display,
{
    // Colour only the label, then reset so the message itself uses the terminal's default colour.
    let (label_color, label_reset) = (Fg(LightRed), Fg(Reset));
    eprintln!(
        "{color}error:{reset} {err}",
        color = label_color,
        reset = label_reset,
        err = error_msg
    );
}
/// Set up the argument parser for the program with all possible flags
fn setup_arg_parser() -> App<'static, 'static> {
App::new(crate_name!())
@ -115,6 +132,11 @@ fn setup_arg_parser() -> App<'static, 'static> {
.long("--ignore-case")
.help("Ignore case when performing matching. If not specified, the matching is case-sensitive."),
)
.arg(
Arg::with_name(OK_IF_BINARY_ARG_NAME)
.short("-b")
.help("Treat the given input file as text, even if it may be a binary file"),
)
}
/// Open the file that was passed to the command line
@ -145,3 +167,33 @@ fn assert_is_directory(file: &File) -> Result<(), io::Error> {
fn make_pattern_case_insensitive(pattern: &str) -> String {
format!("(?i){}", pattern)
}
/// Terminate the process if the given file looks like a binary file.
///
/// Exits with status 4 when the file could not be inspected, and with status 5 when it appears to be
/// binary. Returns normally otherwise.
fn handle_potentially_binary_file(opened_file: &mut OpenedFile) {
    match should_treat_as_binary_file(opened_file) {
        Ok(false) => {}
        Ok(true) => {
            print_error("Input file may be a binary file. Pass -b to ignore this and scan anyway.");
            process::exit(5);
        }
        Err(err) => {
            // The allocation made by `format!` is irrelevant here: the process exits right after.
            print_error(format!("failed to peek file: {}", err));
            process::exit(4);
        }
    }
}
/// Decide whether the opened input should be treated as a binary file.
///
/// Stdin is never inspected and is always reported as not binary. A regular file is sampled with
/// `file::is_file_likely_utf8` and then rewound so that later reads start from the beginning again.
///
/// # Errors
///
/// Returns an [`io::Error`] if sampling the file or rewinding it fails.
fn should_treat_as_binary_file(opened_file: &mut OpenedFile) -> Result<bool, io::Error> {
    match opened_file {
        OpenedFile::File(file) => {
            let looks_binary = file::is_file_likely_utf8(file).map(|is_text| !is_text)?;
            // Sampling consumed the start of the file; put the cursor back before anyone scans it.
            file.rewind()?;
            Ok(looks_binary)
        }
        OpenedFile::Stdin(_) => Ok(false),
    }
}