diff --git a/CHANGELOG.md b/CHANGELOG.md index b5a12aafa..b6dd78717 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ ## Unreleased +### Full font embedding + +Fonts can be embedded in their original form without subsetting or any other +modification. + +(Alexander Mankuta, [#1322](https://github.com/prawnpdf/prawn/pull/1322)) + ### Look for glyph in correct font Take the font style into account when looking for a glyph and fallback fonts are enabled. diff --git a/lib/prawn/font.rb b/lib/prawn/font.rb index 41281adff..4d7597463 100644 --- a/lib/prawn/font.rb +++ b/lib/prawn/font.rb @@ -145,19 +145,23 @@ def width_of(string, options = {}) end end - # Hash that maps font family names to their styled individual font names. + # Hash that maps font family names to their styled individual font + # definitions. # # To add support for another font family, append to this hash, e.g: # # pdf.font_families.update( - # "MyTrueTypeFamily" => { :bold => "foo-bold.ttf", - # :italic => "foo-italic.ttf", - # :bold_italic => "foo-bold-italic.ttf", - # :normal => "foo.ttf" }) + # "MyTrueTypeFamily" => { + # bold: "foo-bold.ttf", + # italic: "foo-italic.ttf", + # bold_italic: "foo-bold-italic.ttf", + # normal: "foo.ttf" + # } + # ) # # This will then allow you to use the fonts like so: # - # pdf.font("MyTrueTypeFamily", :style => :bold) + # pdf.font("MyTrueTypeFamily", style: :bold) # pdf.text "Some bold text" # pdf.font("MyTrueTypeFamily") # pdf.text "Some normal text" @@ -170,6 +174,17 @@ def width_of(string, options = {}) # defining your own font families, you can map any or all of these # styles to whatever font files you'd like. # + # Font definition can be either a hash or just a string. + # + # A hash font definition can specify a number of options: + # + # - :file -- path to the font file (required) + # - :subset -- whether to subset the font (default true). Only + # applicable to TrueType and OpenType fonts (including DFont and TTC). 
+ # + # A string font definition is equivalent to a hash definition with only + # :file being specified. + # def font_families @font_families ||= {}.merge!( 'Courier' => { @@ -339,6 +354,8 @@ def initialize(document, name, options = {}) # :nodoc: @references = {} @subset_name_cache = {} + + @full_font_embedding = options.key?(:subset) && !options[:subset] end # The size of the font ascender in PDF points @@ -401,7 +418,12 @@ def add_to_current_page(subset) end def identifier_for(subset) # :nodoc: - @subset_name_cache[subset] ||= "#{@identifier}.#{subset}".to_sym + @subset_name_cache[subset] ||= + if full_font_embedding + @identifier.to_sym + else + "#{@identifier}.#{subset}".to_sym + end end def inspect # :nodoc: @@ -426,6 +448,8 @@ def eql?(other) # :nodoc: private + attr_reader :full_font_embedding + # generate a font identifier that hasn't been used on the current page yet # def generate_unique_id diff --git a/lib/prawn/fonts/to_unicode_cmap.rb b/lib/prawn/fonts/to_unicode_cmap.rb new file mode 100644 index 000000000..e36e2ee90 --- /dev/null +++ b/lib/prawn/fonts/to_unicode_cmap.rb @@ -0,0 +1,140 @@ +# frozen_string_literal: true + +module Prawn + module Fonts + class ToUnicodeCMap # @private + # mapping is expected to be a hash with keys being character codes (in a + # broad sense, as used in the showing operation strings) and values being + # Unicode code points + def initialize(mapping, code_space_size = nil) + @mapping = mapping + @code_space_size = code_space_size + end + + def generate + chunks = [] + + # Header + chunks << <<~HEADER.chomp + /CIDInit /ProcSet findresource begin + 12 dict begin + begincmap + /CIDSystemInfo 3 dict dup begin + /Registry (Adobe) def + /Ordering (UCS) def + /Supplement 0 def + end def + /CMapName /Adobe-Identity-UCS def + /CMapType 2 def + HEADER + + max_glyph_index = mapping.keys.max + # Range + code_space_size = (max_glyph_index.bit_length / 8.0).ceil + + used_code_space_size = @code_space_size || code_space_size + + # In CMap 
codespaces are not sequential, they're ranges in + # a multi-dimensional space. Each byte is considered separately. So we + # have to maximally extend the lower bytes in order to allow for + # continuous mapping. + # We only keep the highest byte because usually it's lower than + # maximally allowed and we don't want to cover that unused space. + code_space_max = max_glyph_index | ('ff' * (code_space_size - 1)).to_i(16) + + chunks << '1 begincodespacerange' + chunks << format("<%0#{used_code_space_size * 2}X><%0#{used_code_space_size * 2}X>", 0, code_space_max) + chunks << 'endcodespacerange' + + # Mapping + all_spans = + mapping_spans( + mapping.reject { |gid, cid| gid.zero? || (0xd800..0xdfff).cover?(cid) } + ) + + short_spans, long_spans = all_spans.partition { _1[0] == :short } + + long_spans + .each_slice(100) do |spans| + chunks << "#{spans.length} beginbfrange" + + spans.each do |type, span| + case type + when :fully_sorted + chunks << format( + "<%0#{code_space_size * 2}X><%0#{code_space_size * 2}X><%s>", + span.first[0], + span.last[0], + span.first[1].chr(::Encoding::UTF_16BE).unpack1('H*') + ) + when :index_sorted + chunks << format( + "<%0#{code_space_size * 2}X><%0#{code_space_size * 2}X>[%s]", + span.first[0], + span.last[0], + span.map { |_, cid| "<#{cid.chr(::Encoding::UTF_16BE).unpack1('H*')}>" }.join('') + ) + end + end + + chunks << 'endbfrange' + end + + short_spans + .map { |_type, slice| slice.flatten(1) } + .each_slice(100) do |mapping| + chunks << "#{mapping.length} beginbfchar" + chunks.concat( + mapping.map do |(gid, cid)| + format( + "<%0#{code_space_size * 2}X><%s>", + gid, + cid.chr(::Encoding::UTF_16BE).unpack1('H*') + ) + end + ) + chunks << 'endbfchar' + end + + # Footer + chunks << <<~FOOTER.chomp + endcmap + CMapName currentdict /CMap defineresource pop + end + end + FOOTER + + chunks.join("\n") + end + + private + + attr_reader :mapping + + attr_reader :cmap, :code_space_size, :code_space_max + + def mapping_spans(mapping) + 
mapping + .sort + .slice_when { |a, b| (b[0] - a[0]) != 1 } # Slice at key discontinuity + .flat_map do |slice| + if slice.length == 1 + [[:short, slice]] + else + continuous_clices, discontinuous_slices = + slice + .slice_when { |a, b| b[1] - a[1] != 1 } # Slice at value discontinuity + .partition { |subslice| subslice.length > 1 } + + discontinuous_slices + .flatten(1) # Join together + .slice_when { |a, b| (b[0] - a[0]) != 1 } # Slice at key discontinuity, again + .map { _1.length > 1 ? [:index_sorted, _1] : [:short, _1] } + + continuous_clices.map { [:fully_sorted, _1] } + end + end + .sort_by { _1[1][0][0] } # Sort by span start key + end + end + end +end diff --git a/lib/prawn/fonts/ttf.rb b/lib/prawn/fonts/ttf.rb index 7a28a9ca5..fe0b07573 100644 --- a/lib/prawn/fonts/ttf.rb +++ b/lib/prawn/fonts/ttf.rb @@ -9,6 +9,7 @@ require 'ttfunk' require 'ttfunk/subset_collection' +require 'prawn/fonts/to_unicode_cmap' module Prawn module Fonts @@ -43,11 +44,70 @@ def unicode? true end + class FullFontSubsetsCollection + FULL_FONT = Object.new.tap do |obj| + obj.singleton_class.define_method(:inspect) do + super().insert(-2, ' FULL_FONT') + end + end.freeze + + def initialize(original) + @original = original + + (@cmap ||= original.cmap.unicode.first) || raise(NoUnicodeCMap.new(font: name)) + + @code_space_size = + case cmap.code_map.keys.max + when 0..0xff then 1 + when 0x100..0xffff then 2 + when 0x10000..0xffffff then 3 + else + 4 + end + + # Codespaces are not sequential, they're ranges in + # a multi-dimensional space. Each byte is considered separately. So we + # have to maximally extend the lower two bytes in order to allow for + # continuous Unicode mapping. + # We only keep the highest byte because Unicode only goes to 10FFFF + # and fonts usually cover even less of the space. We don't want to + # list all those unmapped character codes here. 
+ @code_space_max = cmap.code_map.keys.max | ('ff' * (code_space_size - 1)).to_i(16) + end + + def encode(characters) + [ + [ + FULL_FONT, + characters.map do |c| + check_bounds!(c) + [cmap[c]].pack('n') + end.join('') + ] + ] + end + + private + + attr_reader :cmap, :code_space_size, :code_space_max + + def check_bounds!(num) + if num > code_space_max + raise Error, "CID (#{num}) exceedes code space size" + end + end + end + def initialize(document, name, options = {}) super @ttf = read_ttf_file - @subsets = TTFunk::SubsetCollection.new(@ttf) + @subsets = + if full_font_embedding + FullFontSubsetsCollection.new(@ttf) + else + TTFunk::SubsetCollection.new(@ttf) + end @italic_angle = nil @attributes = {} @@ -200,7 +260,6 @@ def pdf_flags def normalize_encoding(text) text.encode(::Encoding::UTF_8) rescue StandardError => e - puts e raise Prawn::Errors::IncompatibleStringEncoding, "Encoding #{text.encoding} can not be transparently converted to UTF-8. " \ 'Please ensure the encoding of the string you are attempting ' \ @@ -289,12 +348,26 @@ def register(subset) end def embed(reference, subset) - font_content = @subsets[subset].encode + if full_font_embedding + embed_full_font(reference) + else + embed_subset(reference, subset) + end + end - # FIXME: we need postscript_name and glyph widths from the font - # subset. Perhaps this could be done by querying the subset, - # rather than by parsing the font that the subset produces? 
- font = TTFunk::File.new(font_content) + def embed_subset(reference, subset) + font = TTFunk::File.new(@subsets[subset].encode) + unicode_mapping = @subsets[subset].to_unicode_map + embed_simple_font(reference, font, unicode_mapping) + end + + def embed_simple_font(reference, font, unicode_mapping) + if font_type(font) == :unknown + raise Error, %(Simple font embedding is not uspported for font "#{font.name}.") + end + + true_type = font_type(font) == :true_type + open_type = font_type(font) == :open_type # empirically, it looks like Adobe Reader will not display fonts # if their font name is more than 33 bytes long. Strange. But true. @@ -302,14 +375,14 @@ def embed(reference, subset) raise NoPostscriptName.new(font: font) if basename.nil? - fontfile = @document.ref!(Length1: font_content.size) - fontfile.stream << font_content - fontfile.stream.compress! + fontfile = @document.ref!({}) + fontfile.data[:Length1] = font.contents.size + fontfile.stream << font.contents.string + fontfile.stream.compress! if @document.compression_enabled? descriptor = @document.ref!( Type: :FontDescriptor, FontName: basename.to_sym, - FontFile2: fontfile, FontBBox: bbox, Flags: pdf_flags, StemV: stem_v, @@ -320,10 +393,20 @@ def embed(reference, subset) XHeight: x_height ) + first_char = font.cmap.tables.first.code_map.index { |gid| !gid.zero? } + last_char = font.cmap.tables.first.code_map.rindex { |gid| !gid.zero? } hmtx = font.horizontal_metrics - widths = font.cmap.tables.first.code_map.map do |gid| - Integer(hmtx.widths[gid] * scale_factor) - end[32..] + widths = + font.cmap.tables.first.code_map[first_char..last_char].map do |gid| + if gid.zero? + # These characters are not in the document so we don't ever use + # these values but we need to encode them so let's use as little + # space as possible. 
+ 0 + else + Integer(hmtx.widths[gid] * scale_factor) + end + end # It would be nice to have Encoding set for the macroman subsets, # and only do a ToUnicode cmap for non-encoded unicode subsets. @@ -335,65 +418,120 @@ def embed(reference, subset) # For now, it's simplest to just create a unicode cmap for every font. # It offends my inner purist, but it'll do. - map = @subsets[subset].to_unicode_map + to_unicode = @document.ref!({}) + to_unicode << ToUnicodeCMap.new(unicode_mapping).generate + to_unicode.stream.compress! if @document.compression_enabled? - ranges = [[]] - map.keys.sort.reduce('') do |_s, code| - ranges << [] if ranges.last.length >= 100 - unicode = map[code] - ranges.last << format( - '<%02x><%04x>', - code: code, - unicode: unicode - ) + reference.data.update( + BaseFont: basename.to_sym, + FontDescriptor: descriptor, + FirstChar: first_char, + LastChar: last_char, + Widths: @document.ref!(widths), + ToUnicode: to_unicode + ) + + if true_type + reference.data.update(Subtype: :TrueType) + descriptor.data.update(FontFile2: fontfile) + elsif open_type + @document.renderer.min_version(1.6) + reference.data.update(Subtype: :Type1) + descriptor.data.update(FontFile3: fontfile) + fontfile.data.update(Subtype: :OpenType) end + end - range_blocks = - ranges.reduce(+'') do |s, list| - s << format( - "%d beginbfchar\n%s\nendbfchar\n", - lenght: list.length, - list: list.join("\n") - ) - end + def embed_full_font(reference) + embed_composite_font(reference, @ttf) + end - to_unicode_cmap = UNICODE_CMAP_TEMPLATE % range_blocks.strip + def embed_composite_font(reference, font) + if font_type(font) == :unknown + raise Error, %(Composite font embedding is not uspported for font "#{font.name}.") + end - cmap = @document.ref!({}) - cmap << to_unicode_cmap - cmap.stream.compress! 
+ true_type = font_type(font) == :true_type + open_type = font_type(font) == :open_type - reference.data.update( - Subtype: :TrueType, + fontfile = @document.ref!({}) + fontfile.data[:Length1] = font.contents.size if true_type + fontfile.data[:Subtype] = :CIDFontType0C if open_type + fontfile.stream << font.contents.string + fontfile.stream.compress! if @document.compression_enabled? + + # empirically, it looks like Adobe Reader will not display fonts + # if their font name is more than 33 bytes long. Strange. But true. + basename = font.name.postscript_name[0, 33].delete("\0") + + descriptor = @document.ref!( + Type: :FontDescriptor, + FontName: basename.to_sym, + FontBBox: bbox, + Flags: pdf_flags, + StemV: stem_v, + ItalicAngle: italic_angle, + Ascent: @ascender, + Descent: @descender, + CapHeight: cap_height, + XHeight: x_height + ) + descriptor.data[:FontFile2] = fontfile if true_type + descriptor.data[:FontFile3] = fontfile if open_type + + to_unicode = @document.ref!({}) + to_unicode << ToUnicodeCMap.new( + font.cmap.unicode.first + .code_map + .reject { |cid, gid| gid.zero? || (0xd800..0xdfff).cover?(cid) } + .invert + .sort.to_h, + 2 # Identity-H is a 2-byte encoding + ).generate + to_unicode.stream.compress! if @document.compression_enabled? 
+ + widths = + font.horizontal_metrics.widths.map { |w| (w * scale_factor).round } + + child_font = @document.ref!( + Type: :Font, BaseFont: basename.to_sym, + CIDSystemInfo: { + Registry: 'Adobe', + Ordering: 'Identity', + Supplement: 0 + }, FontDescriptor: descriptor, - FirstChar: 32, - LastChar: 255, - Widths: @document.ref!(widths), - ToUnicode: cmap + W: [0, widths] + ) + if true_type + child_font.data.update( + Subtype: :CIDFontType2, + CIDToGIDMap: :Identity + ) + end + if open_type + child_font.data[:Subtype] = :CIDFontType0 + end + + reference.data.update( + Subtype: :Type0, + BaseFont: basename.to_sym, + Encoding: :'Identity-H', + DescendantFonts: [child_font], + ToUnicode: to_unicode ) end - UNICODE_CMAP_TEMPLATE = <<-STR.strip.gsub(/^\s*/, '') - /CIDInit /ProcSet findresource begin - 12 dict begin - begincmap - /CIDSystemInfo << - /Registry (Adobe) - /Ordering (UCS) - /Supplement 0 - >> def - /CMapName /Adobe-Identity-UCS def - /CMapType 2 def - 1 begincodespacerange - <00> - endcodespacerange - %s - endcmap - CMapName currentdict /CMap defineresource pop - end + def font_type(font) + if font.directory.tables.key?('glyf') + :true_type + elsif font.directory.tables.key?('CFF ') + :open_type + else + :unknown end - STR + end def read_ttf_file TTFunk::File.open(@name) diff --git a/spec/prawn/font_spec.rb b/spec/prawn/font_spec.rb index 185009c75..5211b468f 100644 --- a/spec/prawn/font_spec.rb +++ b/spec/prawn/font_spec.rb @@ -469,6 +469,52 @@ def page_should_not_include_font(font) expect(original.equal?(normalized)).to eq false end end + + describe 'full font embedding' do + let(:font) { pdf.find_font "#{Prawn::DATADIR}/fonts/DejaVuSans.ttf", subset: false } + let(:ref) { pdf.ref!({}).tap { |ref| font.__send__(:embed, ref, nil) } } + + it 'is a composite font' do + font_obj = ref.data + + expect(font_obj[:Subtype]).to eq(:Type0) + expect(font_obj[:DescendantFonts]).to be_an(Array) + expect(font_obj[:DescendantFonts].length).to eq(1) + desc_font = 
font_obj[:DescendantFonts].first.data + expect(desc_font[:Type]).to eq(:Font) + expect(desc_font[:Subtype]).to eq(:CIDFontType2) + end + + it 'has proper metrics' do + descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data + expect(descriptor[:Ascent]).to eq(759) + expect(descriptor[:Descent]).to eq(-240) + expect(descriptor[:CapHeight]).to eq(759) + end + + it 'has proper encoding' do + font_obj = ref.data + expect(font_obj[:Encoding]).to eq(:'Identity-H') + desc_font = font_obj[:DescendantFonts].first.data + expect(desc_font[:CIDToGIDMap]).to eq(:Identity) + end + + it 'contains glyph widths' do + desc_font = ref.data[:DescendantFonts].first.data + expect(desc_font[:W]).to be_an(Array) + expect(desc_font[:W].length).to eq(2) + expect(desc_font[:W][0]).to eq(0) + expect(desc_font[:W][1]).to be_an(Array) + expect(desc_font[:W][1].length).to eq(6108) # All glyph metrics + end + + it 'propely embeds font data' do + descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data + expect(descriptor).to have_key(:FontFile2) + expect(descriptor[:FontFile2].data[:Length1]).to eq(741_536) + expect(descriptor[:FontFile2].stream).to_not be_empty + end + end end describe 'OTF fonts' do @@ -500,6 +546,51 @@ def page_should_not_include_font(font) expect(original).to_not be_equal(normalized) end end + + describe 'full font embedding' do + let(:font) { pdf.find_font "#{Prawn::DATADIR}/fonts/Bodoni-Book.otf", subset: false } + let(:ref) { pdf.ref!({}).tap { |ref| font.__send__(:embed, ref, nil) } } + + it 'is a composite font' do + font_obj = ref.data + + expect(font_obj[:Subtype]).to eq(:Type0) + expect(font_obj[:DescendantFonts]).to be_an(Array) + expect(font_obj[:DescendantFonts].length).to eq(1) + desc_font = font_obj[:DescendantFonts].first.data + expect(desc_font[:Type]).to eq(:Font) + expect(desc_font[:Subtype]).to eq(:CIDFontType0) + end + + it 'has proper metrics' do + descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data + 
expect(descriptor[:Ascent]).to eq(1023) + expect(descriptor[:Descent]).to eq(-200) + expect(descriptor[:CapHeight]).to eq(3072) + end + + it 'has proper encoding' do + font_obj = ref.data + expect(font_obj[:Encoding]).to eq(:'Identity-H') + desc_font = font_obj[:DescendantFonts].first.data + expect(desc_font).to_not have_key(:CIDToGIDMap) + end + + it 'contains glyph widths' do + desc_font = ref.data[:DescendantFonts].first.data + expect(desc_font[:W]).to be_an(Array) + expect(desc_font[:W].length).to eq(2) + expect(desc_font[:W][0]).to eq(0) + expect(desc_font[:W][1]).to be_an(Array) + expect(desc_font[:W][1].length).to eq(353) # All glyph metrics + end + + it 'propely embeds font data' do + descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data + expect(descriptor).to have_key(:FontFile3) + expect(descriptor[:FontFile3].stream).to_not be_empty + end + end end describe 'DFont fonts' do