Skip to content

Commit

Permalink
Remove BOM from first-emitted text event
Browse files Browse the repository at this point in the history
  • Loading branch information
dralley committed Aug 16, 2022
1 parent eec31f9 commit 60e334f
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 16 deletions.
3 changes: 2 additions & 1 deletion Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@
- [#456]: Reader and writer stuff grouped under `reader` and `writer` modules.
You can still use the re-exported definitions from the crate root

- [#458]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful.
- [#458]: Made the `Writer::write()` method non-public as writing random bytes to a document is not generally useful or desirable.
- [#458]: BOM bytes are no longer emitted as `Event::Text`. To write a BOM, use the configuration options present on `Writer`.

### New Tests

Expand Down
2 changes: 1 addition & 1 deletion src/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8],
}

/// Returns the part of `bytes` that `split_at_bom` yields as the second
/// element of its result for the given `encoding`; the first element is
/// discarded. Presumably that second element is the input with any leading
/// byte order mark removed -- confirm against `split_at_bom`.
#[cfg(feature = "encoding")]
pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
    split_at_bom(bytes, encoding).1
}
Expand Down
34 changes: 20 additions & 14 deletions src/reader/parser.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
#[cfg(feature = "encoding")]
use encoding_rs::UTF_8;

#[cfg(feature = "encoding")]
use crate::encoding::detect_encoding;
use crate::encoding::Decoder;
use crate::encoding::{self, Decoder};
use crate::errors::{Error, Result};
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
#[cfg(feature = "encoding")]
Expand Down Expand Up @@ -68,23 +66,31 @@ impl Parser {
///
/// [`Text`]: Event::Text
pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
    // `content` is narrowed step by step; every step below must slice
    // `content` (not `bytes`) so that earlier narrowing is preserved.
    let mut content = bytes;

    if self.trim_text_end {
        // Cut everything after the last non-whitespace byte. When no
        // non-whitespace byte exists, the slice is kept at full length.
        let len = content
            .iter()
            .rposition(|&b| !is_whitespace(b))
            .map_or_else(|| content.len(), |p| p + 1);
        content = &content[..len];
    }

    // Only the first text of a document may carry a byte order mark.
    if first {
        #[cfg(feature = "encoding")]
        if self.encoding.can_be_refined() {
            // Detection looks at the untouched input; the BOM is then
            // removed from the (possibly trimmed) content.
            if let Some(encoding) = encoding::detect_encoding(bytes) {
                self.encoding = EncodingRef::BomDetected(encoding);
                content = encoding::remove_bom(content, encoding);
            }
        }
        #[cfg(not(feature = "encoding"))]
        if content.starts_with(encoding::UTF8_BOM) {
            // Slice `content`, not `bytes`: re-slicing the raw input here
            // would silently undo the trailing-whitespace trim above.
            content = &content[encoding::UTF8_BOM.len()..];
        }
    }

    Ok(Event::Text(BytesText::wrap(content, self.decoder())))
}

Expand Down
2 changes: 2 additions & 0 deletions tests/encodings.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#[allow(dead_code)]
use quick_xml::events::Event;
#[allow(dead_code)]
use quick_xml::Reader;

#[cfg(feature = "encoding")]
Expand Down

0 comments on commit 60e334f

Please sign in to comment.