Add handling of binary files
Will refuse to scan binary files unless otherwise specified.
master
parent
076d431f54
commit
f080e6604c
|
@ -217,7 +217,6 @@ version = "0.1.1"
|
|||
dependencies = [
|
||||
"clap",
|
||||
"grep",
|
||||
"stringreader",
|
||||
"termion",
|
||||
"test-case",
|
||||
"thiserror",
|
||||
|
@ -382,12 +381,6 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stringreader"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "913e7b03d63752f6cdd2df77da36749d82669904798fe8944b9ec3d23f159905"
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.8.0"
|
||||
|
|
|
@ -16,4 +16,3 @@ clap = "2.33"
|
|||
|
||||
[dev-dependencies]
|
||||
test-case = "1.2.1"
|
||||
stringreader = "0.1"
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
use std::io::{Error, Read};
|
||||
|
||||
/// Maximum number of invalid UTF-8 sequences tolerated in the sampled prefix before the input is
/// considered not to be UTF-8.
const NOT_UTF_8_THRESHOLD: usize = 5;
/// Number of bytes sampled from the start of the input.
const BUFFER_CHECK_AMOUNT: usize = 255;

/// Checks whether a file is likely filled entirely with UTF-8 characters.
/// This is useful to check if a file is likely human-readable or not.
///
/// Only the first `BUFFER_CHECK_AMOUNT` bytes are inspected, and up to `NOT_UTF_8_THRESHOLD`
/// invalid sequences are tolerated within that sample. The reader is left positioned after the
/// sampled bytes; callers that want to re-read the data must seek back themselves.
///
/// # Errors
///
/// An [`std::io::Error`] will be returned if there is an underlying problem reading from the given [`Read`]
//
// This mechanism is inspired heavily by `less`' implementation, which follows the same semantics (in utf-8 mode,
// at least).
// https://github.com/gwsw/less/blob/294976950f5dc2a6b3436b1d2df97034936552b9/filename.c#L480-L484
pub fn is_file_likely_utf8<R: Read>(file: &mut R) -> Result<bool, Error> {
    let mut buf = [0_u8; BUFFER_CHECK_AMOUNT];

    // A single `read` may legally return fewer bytes than are available, so keep reading until
    // the sample buffer is full or the reader reports end-of-input.
    let mut filled = 0;
    while filled < buf.len() {
        match file.read(&mut buf[filled..]) {
            Ok(0) => break,
            Ok(bytes_read) => filled += bytes_read,
            // An interrupted read is not a failure; simply try again.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => {}
            Err(err) => return Err(err),
        }
    }
    let sample_was_cut_off = filled == buf.len();

    // Count invalid sequences by walking the decoder errors directly. Counting U+FFFD in a lossily
    // decoded string would also penalise text that legitimately contains that character.
    let mut remaining = &buf[..filled];
    let mut invalid_sequences = 0_usize;
    while let Err(err) = std::str::from_utf8(remaining) {
        if let Some(invalid_len) = err.error_len() {
            invalid_sequences += 1;
            remaining = &remaining[err.valid_up_to() + invalid_len..];
        } else {
            // The sample ends in the middle of a multi-byte character. If the buffer is full, that
            // is most likely our own sampling boundary and not a defect in the data; otherwise
            // the input itself really ends with a truncated sequence.
            if !sample_was_cut_off {
                invalid_sequences += 1;
            }
            break;
        }
    }

    Ok(invalid_sequences <= NOT_UTF_8_THRESHOLD)
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;
    use test_case::test_case;

    // Each case feeds raw bytes through an in-memory reader and checks the classification.
    // Every `\xNN` byte used here is an invalid UTF-8 lead byte, so each one counts as exactly one
    // invalid sequence; the two boundary cases pin the tolerated amount (five) from both sides.
    #[test_case(b"", true; "empty input is utf-8")]
    #[test_case(b"hello", true; "simple string is utf-8")]
    #[test_case(b"hello\xff\xffworld", true; "a few non-utf-8 bytes are ok")]
    #[test_case(b"\xff\xff\xff\xff\xff", true; "exactly the tolerated amount of non-utf-8 is ok")]
    #[test_case(b"\xff\xff\xff\xff\xff\xff", false; "one past the tolerated amount of non-utf-8 is not ok")]
    #[test_case(b"hello\xff\xffworld\xfa\xfb\xfc\xfd\xfe", false; "too many non-utf-8 is not ok")]
    fn test_is_file_likely_utf8(s: &[u8], is_utf8: bool) {
        let mut byte_reader = Cursor::new(s);
        assert_eq!(is_utf8, is_file_likely_utf8(&mut byte_reader).unwrap());
    }
}
|
|
@ -7,6 +7,7 @@ use std::io;
|
|||
use std::io::Read;
|
||||
use thiserror::Error;
|
||||
|
||||
pub mod file;
|
||||
mod lines;
|
||||
pub mod print;
|
||||
mod sink;
|
||||
|
@ -97,7 +98,7 @@ mod tests {
|
|||
use super::*;
|
||||
use crate::testutil;
|
||||
use std::io;
|
||||
use stringreader::StringReader;
|
||||
use std::io::Cursor;
|
||||
use test_case::test_case;
|
||||
use testutil::mock_print::MockPrinter;
|
||||
|
||||
|
@ -118,7 +119,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_highlights_matches() {
|
||||
let mock_printer = MockPrinter::default();
|
||||
let mut lipsum_reader = StringReader::new(SEARCH_TEXT);
|
||||
let mut lipsum_reader = Cursor::new(SEARCH_TEXT);
|
||||
let res = scan_pattern_to_printer(
|
||||
&mut lipsum_reader,
|
||||
r#""?computable"?\snumbers"#,
|
||||
|
@ -157,7 +158,7 @@ mod tests {
|
|||
#[test]
|
||||
fn case_insensitive_pattern_matches() {
|
||||
let mock_printer = MockPrinter::default();
|
||||
let mut lipsum_reader = StringReader::new(SEARCH_TEXT);
|
||||
let mut lipsum_reader = Cursor::new(SEARCH_TEXT);
|
||||
// This test is a little bit of a cheat, because it doesn't test what's actually inputted by the CLI,
|
||||
// but it does make sure the functionality works as expected
|
||||
let res = scan_pattern_to_printer(&mut lipsum_reader, "(?i)INTEGRAL", &mock_printer);
|
||||
|
@ -202,7 +203,7 @@ mod tests {
|
|||
let broken_pipe_err =
|
||||
print::Error::from(io::Error::new(io::ErrorKind::BrokenPipe, "broken pipe"));
|
||||
mock_printer.fail_next(broken_pipe_err);
|
||||
let mut lipsum_reader = StringReader::new(SEARCH_TEXT);
|
||||
let mut lipsum_reader = Cursor::new(SEARCH_TEXT);
|
||||
let res = scan_pattern_to_printer(&mut lipsum_reader, pattern, &mock_printer);
|
||||
|
||||
assert!(!res.is_err(), "failed to search: {:?}", res.unwrap_err());
|
||||
|
|
72
src/main.rs
72
src/main.rs
|
@ -1,15 +1,18 @@
|
|||
#![warn(clippy::all, clippy::pedantic)]
|
||||
use clap::{crate_name, crate_version, App, AppSettings, Arg, ArgMatches};
|
||||
use hline::file;
|
||||
use std::env;
|
||||
use std::fmt::Display;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{Read, Stdin};
|
||||
use std::io::{Read, Seek, Stdin};
|
||||
use std::process;
|
||||
use termion::color::{Fg, LightRed, Reset};
|
||||
|
||||
const FILENAME_ARG_NAME: &str = "filename";
|
||||
const PATTERN_ARG_NAME: &str = "pattern";
|
||||
const CASE_INSENSITIVE_ARG_NAME: &str = "case-insensitive";
|
||||
const OK_IF_BINARY_ARG_NAME: &str = "ok-if-binary";
|
||||
|
||||
/// `OpenedFile` represents some kind of file that was opened for further handling by `hl`
|
||||
enum OpenedFile {
|
||||
|
@ -27,6 +30,7 @@ enum PassedFile {
|
|||
struct Args {
    /// Pattern to scan the input for, built from the `pattern` command-line argument.
    // NOTE(review): presumably already prefixed for case-insensitive matching when that flag is
    // given — the mapping closure is not fully visible here; confirm.
    pattern: String,
    /// The input that was selected on the command line; handed to `open_file` by `main`.
    file: PassedFile,
    /// Set when the `ok-if-binary` flag is present; `main` then skips the binary-file check.
    ok_if_binary_file: bool,
}
|
||||
|
||||
impl Read for OpenedFile {
|
||||
|
@ -42,6 +46,7 @@ impl Read for OpenedFile {
|
|||
impl From<ArgMatches<'_>> for Args {
|
||||
fn from(args: ArgMatches) -> Self {
|
||||
let case_insensitive = args.is_present(CASE_INSENSITIVE_ARG_NAME);
|
||||
let ok_if_binary_file = args.is_present(OK_IF_BINARY_ARG_NAME);
|
||||
let pattern = args
|
||||
.value_of(PATTERN_ARG_NAME)
|
||||
.map(|pat| {
|
||||
|
@ -59,7 +64,11 @@ impl From<ArgMatches<'_>> for Args {
|
|||
PassedFile::Path(filename.to_string())
|
||||
});
|
||||
|
||||
Args { pattern, file }
|
||||
Args {
|
||||
pattern,
|
||||
file,
|
||||
ok_if_binary_file,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -70,24 +79,32 @@ fn main() {
|
|||
let args = args_parse_result.unwrap();
|
||||
let open_file_result = open_file(args.file);
|
||||
if let Err(err) = open_file_result {
|
||||
eprintln!("Failed to open input file: {}", err);
|
||||
print_error(format!("Failed to open input file: {}", err));
|
||||
process::exit(2);
|
||||
}
|
||||
|
||||
let opened_file = open_file_result.unwrap();
|
||||
let mut opened_file = open_file_result.unwrap();
|
||||
if !args.ok_if_binary_file {
|
||||
handle_potentially_binary_file(&mut opened_file);
|
||||
}
|
||||
|
||||
let scan_result = hline::scan_pattern(opened_file, &args.pattern);
|
||||
if let Err(err) = scan_result {
|
||||
// the lib crate provides the context for the errors in their error messages
|
||||
eprintln!(
|
||||
"{color}error:{reset} {err}",
|
||||
color = Fg(LightRed),
|
||||
reset = Fg(Reset),
|
||||
err = err
|
||||
);
|
||||
print_error(err);
|
||||
process::exit(3);
|
||||
}
|
||||
}
|
||||
|
||||
fn print_error<T: Display>(error_msg: T) {
|
||||
eprintln!(
|
||||
"{color}error:{reset} {err}",
|
||||
color = Fg(LightRed),
|
||||
reset = Fg(Reset),
|
||||
err = error_msg
|
||||
);
|
||||
}
|
||||
|
||||
/// Setup the argument parser for the program with all possible flags
|
||||
fn setup_arg_parser() -> App<'static, 'static> {
|
||||
App::new(crate_name!())
|
||||
|
@ -115,6 +132,11 @@ fn setup_arg_parser() -> App<'static, 'static> {
|
|||
.long("--ignore-case")
|
||||
.help("Ignore case when performing matching. If not specified, the matching is case-sensitive."),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name(OK_IF_BINARY_ARG_NAME)
|
||||
.short("-b")
|
||||
.help("Treat the given input file as text, even if it may be a binary file"),
|
||||
)
|
||||
}
|
||||
|
||||
/// Open the file that was passed to the command line
|
||||
|
@ -145,3 +167,33 @@ fn assert_is_directory(file: &File) -> Result<(), io::Error> {
|
|||
/// Build a copy of `pattern` prefixed with the inline `(?i)` flag.
fn make_pattern_case_insensitive(pattern: &str) -> String {
    const CASE_INSENSITIVE_FLAG: &str = "(?i)";

    let mut flagged = String::with_capacity(CASE_INSENSITIVE_FLAG.len() + pattern.len());
    flagged.push_str(CASE_INSENSITIVE_FLAG);
    flagged.push_str(pattern);
    flagged
}
|
||||
|
||||
/// Check if the given file is a binary file, and if it is, exit gracefully
|
||||
fn handle_potentially_binary_file(opened_file: &mut OpenedFile) {
|
||||
let is_binary_file = match should_treat_as_binary_file(opened_file) {
|
||||
Err(err) => {
|
||||
// This could probably be done nicer with a macro but I don't care about a small allocation like this
|
||||
// when we're immediately about to quit anyway
|
||||
print_error(format!("failed to peek file: {}", err));
|
||||
process::exit(4);
|
||||
}
|
||||
Ok(val) => val,
|
||||
};
|
||||
|
||||
if is_binary_file {
|
||||
print_error("Input file may be a binary file. Pass -b to ignore this and scan anyway.");
|
||||
process::exit(5);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if a given file is a binary file (or not possible to be easily checked)
|
||||
fn should_treat_as_binary_file(opened_file: &mut OpenedFile) -> Result<bool, io::Error> {
|
||||
match opened_file {
|
||||
OpenedFile::Stdin(_) => Ok(false),
|
||||
OpenedFile::File(file) => {
|
||||
let is_likely_utf8 = file::is_file_likely_utf8(file)?;
|
||||
file.rewind()?;
|
||||
Ok(!is_likely_utf8)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue