Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-12-10 10:23:51

0001 //========================================================================
0002 //
0003 // TextOutputDev.h
0004 //
0005 // Copyright 1997-2003 Glyph & Cog, LLC
0006 //
0007 //========================================================================
0008 
0009 //========================================================================
0010 //
0011 // Modified under the Poppler project - http://poppler.freedesktop.org
0012 //
0013 // All changes made under the Poppler project to this file are licensed
0014 // under GPL version 2 or later
0015 //
0016 // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
0017 // Copyright (C) 2006 Ed Catmur <ed@catmur.co.uk>
0018 // Copyright (C) 2007, 2008, 2011, 2013 Carlos Garcia Campos <carlosgc@gnome.org>
0019 // Copyright (C) 2007, 2017 Adrian Johnson <ajohnson@redneon.com>
0020 // Copyright (C) 2008, 2010, 2015, 2016, 2018, 2019, 2021 Albert Astals Cid <aacid@kde.org>
0021 // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
0022 // Copyright (C) 2012, 2013, 2015, 2016 Jason Crain <jason@aquaticape.us>
0023 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
0024 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
0025 // Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com>
0026 // Copyright (C) 2018, 2020, 2021 Nelson Benítez León <nbenitezl@gmail.com>
0027 // Copyright (C) 2019, 2022 Oliver Sander <oliver.sander@tu-dresden.de>
0028 // Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com>
0029 // Copyright (C) 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
0030 //
0031 // To see a description of the changes please see the Changelog file that
0032 // came with your tarball or type make ChangeLog if you are building from git
0033 //
0034 //========================================================================
0035 
0036 #ifndef TEXTOUTPUTDEV_H
0037 #define TEXTOUTPUTDEV_H
0038 
0039 #include "poppler-config.h"
0040 #include "poppler_private_export.h"
0041 #include <cstdio>
0042 #include "GfxFont.h"
0043 #include "GfxState.h"
0044 #include "OutputDev.h"
0045 
// Forward declarations: non-text types that this header only refers to
// through pointers or references.
class AnnotLink;
class Gfx;
class GfxFont;
class GfxState;
class GooString;
class UnicodeMap;

// Forward declarations: types belonging to the text extraction code.
class TextBlock;
class TextFlow;
class TextLine;
class TextLineFrag;
class TextLink;
class TextPage;
class TextPool;
class TextSelectionVisitor;
class TextUnderline;
class TextWord;
class TextWordList;
0064 
0065 //------------------------------------------------------------------------
0066 
// Output callback type: receives the caller-supplied <stream> handle,
// a pointer to the text and the length of that text.
using TextOutputFunc = void (*)(void *stream, const char *text, int len);
0068 
// Granularity argument accepted by the selection functions
// (visitSelection(), drawSelection(), getSelection*()).
enum SelectionStyle
{
    selectionStyleGlyph = 0,
    selectionStyleWord = 1,
    selectionStyleLine = 2
};
0075 
// End-of-line convention used when text is produced.
enum EndOfLineKind
{
    eolUnix = 0, // line feed (LF)
    eolDOS = 1, // carriage return followed by line feed (CR+LF)
    eolMac = 2 // carriage return (CR)
};
0082 
0083 //------------------------------------------------------------------------
0084 // TextFontInfo
0085 //------------------------------------------------------------------------
0086 
0087 class POPPLER_PRIVATE_EXPORT TextFontInfo
0088 {
0089 public:
0090     explicit TextFontInfo(const GfxState *state);
0091     ~TextFontInfo();
0092 
0093     TextFontInfo(const TextFontInfo &) = delete;
0094     TextFontInfo &operator=(const TextFontInfo &) = delete;
0095 
0096     bool matches(const GfxState *state) const;
0097     bool matches(const TextFontInfo *fontInfo) const;
0098     bool matches(const Ref *ref) const;
0099 
0100     // Get the font ascent, or a default value if the font is not set
0101     double getAscent() const;
0102 
0103     // Get the font descent, or a default value if the font is not set
0104     double getDescent() const;
0105 
0106     // Get the writing mode (0 or 1), or 0 if the font is not set
0107     int getWMode() const;
0108 
0109 #ifdef TEXTOUT_WORD_LIST
0110     // Get the font name (which may be NULL).
0111     const GooString *getFontName() const { return fontName; }
0112 
0113     // Get font descriptor flags.
0114     bool isFixedWidth() const { return flags & fontFixedWidth; }
0115     bool isSerif() const { return flags & fontSerif; }
0116     bool isSymbolic() const { return flags & fontSymbolic; }
0117     bool isItalic() const { return flags & fontItalic; }
0118     bool isBold() const { return flags & fontBold; }
0119 #endif
0120 
0121 private:
0122     std::shared_ptr<GfxFont> gfxFont;
0123 #ifdef TEXTOUT_WORD_LIST
0124     GooString *fontName;
0125     int flags;
0126 #endif
0127 
0128     friend class TextWord;
0129     friend class TextPage;
0130     friend class TextSelectionPainter;
0131 };
0132 
0133 //------------------------------------------------------------------------
0134 // TextWord
0135 //------------------------------------------------------------------------
0136 
0137 class POPPLER_PRIVATE_EXPORT TextWord
0138 {
0139 public:
0140     // Constructor.
0141     TextWord(const GfxState *state, int rotA, double fontSize);
0142 
0143     // Destructor.
0144     ~TextWord();
0145 
0146     TextWord(const TextWord &) = delete;
0147     TextWord &operator=(const TextWord &) = delete;
0148 
0149     // Add a character to the word.
0150     void addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA);
0151 
0152     // Attempt to add a character to the word as a combining character.
0153     // Either character u or the last character in the word must be an
0154     // acute, dieresis, or other combining character.  Returns true if
0155     // the character was added.
0156     bool addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA);
0157 
0158     // Merge <word> onto the end of <this>.
0159     void merge(TextWord *word);
0160 
0161     // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
0162     // based on a primary-axis comparison, e.g., x ordering if rot=0.
0163     int primaryCmp(const TextWord *word) const;
0164 
0165     // Return the distance along the primary axis between <this> and
0166     // <word>.
0167     double primaryDelta(const TextWord *word) const;
0168 
0169     static int cmpYX(const void *p1, const void *p2);
0170 
0171     void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);
0172 
0173     // Get the TextFontInfo object associated with a character.
0174     const TextFontInfo *getFontInfo(int idx) const { return font[idx]; }
0175 
0176     // Get the next TextWord on the linked list.
0177     const TextWord *getNext() const { return next; }
0178 
0179 #ifdef TEXTOUT_WORD_LIST
0180     int getLength() const { return len; }
0181     const Unicode *getChar(int idx) const { return &text[idx]; }
0182     GooString *getText() const;
0183     const GooString *getFontName(int idx) const { return font[idx]->fontName; }
0184     void getColor(double *r, double *g, double *b) const
0185     {
0186         *r = colorR;
0187         *g = colorG;
0188         *b = colorB;
0189     }
0190     void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
0191     {
0192         *xMinA = xMin;
0193         *yMinA = yMin;
0194         *xMaxA = xMax;
0195         *yMaxA = yMax;
0196     }
0197     void getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const;
0198     double getFontSize() const { return fontSize; }
0199     int getRotation() const { return rot; }
0200     int getCharPos() const { return charPos[0]; }
0201     int getCharLen() const { return charPos[len] - charPos[0]; }
0202     bool getSpaceAfter() const { return spaceAfter; }
0203 #endif
0204     bool isUnderlined() const { return underlined; }
0205     const AnnotLink *getLink() const { return link; }
0206     double getEdge(int i) const { return edge[i]; }
0207     double getBaseline() const { return base; }
0208     bool hasSpaceAfter() const { return spaceAfter; }
0209     const TextWord *nextWord() const { return next; };
0210 
0211 private:
0212     void ensureCapacity(int capacity);
0213     void setInitialBounds(TextFontInfo *fontA, double x, double y);
0214 
0215     int rot; // rotation, multiple of 90 degrees
0216              //   (0, 1, 2, or 3)
0217     int wMode; // horizontal (0) or vertical (1) writing mode
0218     double xMin, xMax; // bounding box x coordinates
0219     double yMin, yMax; // bounding box y coordinates
0220     double base; // baseline x or y coordinate
0221     Unicode *text; // the text
0222     CharCode *charcode; // glyph indices
0223     double *edge; // "near" edge x or y coord of each char
0224                   //   (plus one extra entry for the last char)
0225     int *charPos; // character position (within content stream)
0226                   //   of each char (plus one extra entry for
0227                   //   the last char)
0228     int len; // length of text/edge/charPos/font arrays
0229     int size; // size of text/edge/charPos/font arrays
0230     TextFontInfo **font; // font information for each char
0231     Matrix *textMat; // transformation matrix for each char
0232     double fontSize; // font size
0233     bool spaceAfter; // set if there is a space between this
0234                      //   word and the next word on the line
0235     bool underlined;
0236     bool invisible; // whether we are invisible (glyphless)
0237     TextWord *next; // next word in line
0238 
0239 #ifdef TEXTOUT_WORD_LIST
0240     double colorR, // word color
0241             colorG, colorB;
0242 #endif
0243 
0244     AnnotLink *link;
0245 
0246     friend class TextPool;
0247     friend class TextLine;
0248     friend class TextBlock;
0249     friend class TextFlow;
0250     friend class TextWordList;
0251     friend class TextPage;
0252 
0253     friend class TextSelectionPainter;
0254     friend class TextSelectionDumper;
0255 };
0256 
0257 //------------------------------------------------------------------------
0258 // TextPool
0259 //------------------------------------------------------------------------
0260 
0261 class TextPool
0262 {
0263 public:
0264     TextPool();
0265     ~TextPool();
0266 
0267     TextPool(const TextPool &) = delete;
0268     TextPool &operator=(const TextPool &) = delete;
0269 
0270     TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; }
0271     void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; }
0272 
0273     int getBaseIdx(double base) const;
0274 
0275     void addWord(TextWord *word);
0276 
0277 private:
0278     int minBaseIdx; // min baseline bucket index
0279     int maxBaseIdx; // max baseline bucket index
0280     TextWord **pool; // array of linked lists, one for each
0281                      //   baseline value (multiple of 4 pts)
0282     TextWord *cursor; // pointer to last-accessed word
0283     int cursorBaseIdx; // baseline bucket index of last-accessed word
0284 
0285     friend class TextBlock;
0286     friend class TextPage;
0287 };
0288 
0289 struct TextFlowData;
0290 
0291 //------------------------------------------------------------------------
0292 // TextLine
0293 //------------------------------------------------------------------------
0294 
0295 class TextLine
0296 {
0297 public:
0298     TextLine(TextBlock *blkA, int rotA, double baseA);
0299     ~TextLine();
0300 
0301     TextLine(const TextLine &) = delete;
0302     TextLine &operator=(const TextLine &) = delete;
0303 
0304     void addWord(TextWord *word);
0305 
0306     // Return the distance along the primary axis between <this> and
0307     // <line>.
0308     double primaryDelta(const TextLine *line) const;
0309 
0310     // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
0311     // based on a primary-axis comparison, e.g., x ordering if rot=0.
0312     int primaryCmp(const TextLine *line) const;
0313 
0314     // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
0315     // based on a secondary-axis comparison of the baselines, e.g., y
0316     // ordering if rot=0.
0317     int secondaryCmp(const TextLine *line) const;
0318 
0319     int cmpYX(const TextLine *line) const;
0320 
0321     static int cmpXY(const void *p1, const void *p2);
0322 
0323     void coalesce(const UnicodeMap *uMap);
0324 
0325     void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);
0326 
0327     // Get the head of the linked list of TextWords.
0328     const TextWord *getWords() const { return words; }
0329 
0330     // Get the next TextLine on the linked list.
0331     const TextLine *getNext() const { return next; }
0332 
0333     // Returns true if the last char of the line is a hyphen.
0334     bool isHyphenated() const { return hyphenated; }
0335 
0336 private:
0337     TextBlock *blk; // parent block
0338     int rot; // text rotation
0339     double xMin, xMax; // bounding box x coordinates
0340     double yMin, yMax; // bounding box y coordinates
0341     double base; // baseline x or y coordinate
0342     TextWord *words; // words in this line
0343     TextWord *lastWord; // last word in this line
0344     Unicode *text; // Unicode text of the line, including
0345                    //   spaces between words
0346     double *edge; // "near" edge x or y coord of each char
0347                   //   (plus one extra entry for the last char)
0348     int *col; // starting column number of each Unicode char
0349     int len; // number of Unicode chars
0350     int convertedLen; // total number of converted characters
0351     bool hyphenated; // set if last char is a hyphen
0352     TextLine *next; // next line in block
0353     Unicode *normalized; // normalized form of Unicode text
0354     int normalized_len; // number of normalized Unicode chars
0355     int *normalized_idx; // indices of normalized chars into Unicode text
0356     Unicode *ascii_translation; // ascii translation from the normalized text
0357     int ascii_len; // length of ascii translation text
0358     int *ascii_idx; // indices of ascii chars into Unicode text of line
0359 
0360     friend class TextLineFrag;
0361     friend class TextBlock;
0362     friend class TextFlow;
0363     friend class TextWordList;
0364     friend class TextPage;
0365 
0366     friend class TextSelectionPainter;
0367     friend class TextSelectionSizer;
0368     friend class TextSelectionDumper;
0369 };
0370 
0371 //------------------------------------------------------------------------
0372 // TextBlock
0373 //------------------------------------------------------------------------
0374 
0375 class TextBlock
0376 {
0377 public:
0378     TextBlock(TextPage *pageA, int rotA);
0379     ~TextBlock();
0380 
0381     TextBlock(const TextBlock &) = delete;
0382     TextBlock &operator=(const TextBlock &) = delete;
0383 
0384     void addWord(TextWord *word);
0385 
0386     void coalesce(const UnicodeMap *uMap, double fixedPitch);
0387 
0388     // Update this block's priMin and priMax values, looking at <blk>.
0389     void updatePriMinMax(const TextBlock *blk);
0390 
0391     static int cmpXYPrimaryRot(const void *p1, const void *p2);
0392 
0393     static int cmpYXPrimaryRot(const void *p1, const void *p2);
0394 
0395     int primaryCmp(const TextBlock *blk) const;
0396 
0397     double secondaryDelta(const TextBlock *blk) const;
0398 
0399     // Returns true if <this> is below <blk>, relative to the page's
0400     // primary rotation.
0401     bool isBelow(const TextBlock *blk) const;
0402 
0403     void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);
0404 
0405     // Get the head of the linked list of TextLines.
0406     const TextLine *getLines() const { return lines; }
0407 
0408     // Get the next TextBlock on the linked list.
0409     const TextBlock *getNext() const { return next; }
0410 
0411     void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
0412     {
0413         *xMinA = xMin;
0414         *yMinA = yMin;
0415         *xMaxA = xMax;
0416         *yMaxA = yMax;
0417     }
0418 
0419     int getLineCount() const { return nLines; }
0420 
0421 private:
0422     bool isBeforeByRule1(const TextBlock *blk1);
0423     bool isBeforeByRepeatedRule1(const TextBlock *blkList, const TextBlock *blk1);
0424     bool isBeforeByRule2(const TextBlock *blk1);
0425 
0426     int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited);
0427     int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize);
0428 
0429     TextPage *page; // the parent page
0430     int rot; // text rotation
0431     double xMin, xMax; // bounding box x coordinates
0432     double yMin, yMax; // bounding box y coordinates
0433     double priMin, priMax; // whitespace bounding box along primary axis
0434     double ExMin, ExMax; // extended bounding box x coordinates
0435     double EyMin, EyMax; // extended bounding box y coordinates
0436     int tableId; // id of table to which this block belongs
0437     bool tableEnd; // is this block at end of line of actual table
0438 
0439     TextPool *pool; // pool of words (used only until lines
0440                     //   are built)
0441     TextLine *lines; // linked list of lines
0442     TextLine *curLine; // most recently added line
0443     int nLines; // number of lines
0444     int charCount; // number of characters in the block
0445     int col; // starting column
0446     int nColumns; // number of columns in the block
0447 
0448     TextBlock *next;
0449     TextBlock *stackNext;
0450 
0451     friend class TextLine;
0452     friend class TextLineFrag;
0453     friend class TextFlow;
0454     friend class TextWordList;
0455     friend class TextPage;
0456     friend class TextSelectionPainter;
0457     friend class TextSelectionDumper;
0458 };
0459 
0460 //------------------------------------------------------------------------
0461 // TextFlow
0462 //------------------------------------------------------------------------
0463 
0464 class TextFlow
0465 {
0466 public:
0467     TextFlow(TextPage *pageA, TextBlock *blk);
0468     ~TextFlow();
0469 
0470     TextFlow(const TextFlow &) = delete;
0471     TextFlow &operator=(const TextFlow &) = delete;
0472 
0473     // Add a block to the end of this flow.
0474     void addBlock(TextBlock *blk);
0475 
0476     // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
0477     // it uses a font no larger than the last block added to the flow,
0478     // and (2) it fits within the flow's [priMin, priMax] along the
0479     // primary axis.
0480     bool blockFits(const TextBlock *blk, const TextBlock *prevBlk) const;
0481 
0482     // Get the head of the linked list of TextBlocks.
0483     const TextBlock *getBlocks() const { return blocks; }
0484 
0485     // Get the next TextFlow on the linked list.
0486     const TextFlow *getNext() const { return next; }
0487 
0488 private:
0489     TextPage *page; // the parent page
0490     double xMin, xMax; // bounding box x coordinates
0491     double yMin, yMax; // bounding box y coordinates
0492     double priMin, priMax; // whitespace bounding box along primary axis
0493     TextBlock *blocks; // blocks in flow
0494     TextBlock *lastBlk; // last block in this flow
0495     TextFlow *next;
0496 
0497     friend class TextWordList;
0498     friend class TextPage;
0499 };
0500 
0501 #ifdef TEXTOUT_WORD_LIST
0502 
0503 //------------------------------------------------------------------------
0504 // TextWordList
0505 //------------------------------------------------------------------------
0506 
0507 class POPPLER_PRIVATE_EXPORT TextWordList
0508 {
0509 public:
0510     // Build a flat word list, in content stream order (if
0511     // text->rawOrder is true), physical layout order (if <physLayout>
0512     // is true and text->rawOrder is false), or reading order (if both
0513     // flags are false).
0514     TextWordList(const TextPage *text, bool physLayout);
0515 
0516     ~TextWordList();
0517 
0518     TextWordList(const TextWordList &) = delete;
0519     TextWordList &operator=(const TextWordList &) = delete;
0520 
0521     // Return the number of words on the list.
0522     int getLength() const;
0523 
0524     // Return the <idx>th word from the list.
0525     TextWord *get(int idx);
0526 
0527 private:
0528     std::vector<TextWord *> words;
0529 };
0530 
0531 #endif // TEXTOUT_WORD_LIST
0532 
0533 class TextWordSelection
0534 {
0535 public:
0536     TextWordSelection(const TextWord *wordA, int beginA, int endA) : word(wordA), begin(beginA), end(endA) { }
0537 
0538     const TextWord *getWord() const { return word; }
0539     int getBegin() const { return begin; }
0540     int getEnd() const { return end; }
0541 
0542 private:
0543     const TextWord *word;
0544     int begin;
0545     int end;
0546 
0547     friend class TextSelectionPainter;
0548     friend class TextSelectionDumper;
0549 };
0550 
0551 //------------------------------------------------------------------------
0552 // TextPage
0553 //------------------------------------------------------------------------
0554 
0555 class POPPLER_PRIVATE_EXPORT TextPage
0556 {
0557 public:
0558     // Constructor.
0559     explicit TextPage(bool rawOrderA, bool discardDiagA = false);
0560 
0561     TextPage(const TextPage &) = delete;
0562     TextPage &operator=(const TextPage &) = delete;
0563 
0564     void incRefCnt();
0565     void decRefCnt();
0566 
0567     // Start a new page.
0568     void startPage(const GfxState *state);
0569 
0570     // End the current page.
0571     void endPage();
0572 
0573     // Update the current font.
0574     void updateFont(const GfxState *state);
0575 
0576     // Begin a new word.
0577     void beginWord(const GfxState *state);
0578 
0579     // Add a character to the current word.
0580     void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen);
0581 
0582     // Add <nChars> invisible characters.
0583     void incCharCount(int nChars);
0584 
0585     // End the current word, sorting it into the list of words.
0586     void endWord();
0587 
0588     // Add a word, sorting it into the list of words.
0589     void addWord(TextWord *word);
0590 
0591     // Add a (potential) underline.
0592     void addUnderline(double x0, double y0, double x1, double y1);
0593 
0594     // Add a hyperlink.
0595     void addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link);
0596 
0597     // Coalesce strings that look like parts of the same line.
0598     void coalesce(bool physLayout, double fixedPitch, bool doHTML);
0599     void coalesce(bool physLayout, double fixedPitch, bool doHTML, double minColSpacing1);
0600 
0601     // Find a string.  If <startAtTop> is true, starts looking at the
0602     // top of the page; else if <startAtLast> is true, starts looking
0603     // immediately after the last find result; else starts looking at
0604     // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
0605     // bottom of the page; else if <stopAtLast> is true, stops looking
0606     // just before the last find result; else stops looking at
0607     // <xMax>,<yMax>.
0608     bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax);
0609 
0610     // Adds new parameter ignoreDiacritics, which will do diacritics
0611     // insensitive search, i.e. ignore accents, umlauts, diaeresis,etc.
0612     // while matching. This option will be ignored if <s> contains characters
0613     // which are not pure ascii.
0614     bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
0615                   double *yMax);
0616 
0617     // Adds new parameter <matchAcrossLines>, which allows <s> to match on text
0618     // spanning from end of a line to the next line. In that case, the rect for
0619     // the part of match that falls on the next line will be stored in
0620     // <continueMatch>, and if hyphenation (i.e. ignoring hyphen at end of line)
0621     // was used while matching at the end of the line prior to <continueMatch>,
0622     // then <ignoredHyphen> will be true, otherwise will be false.
0623     // Only finding across two lines is supported, i.e. it won't match where <s>
0624     // spans more than two lines.
0625     //
0626     // <matchAcrossLines> will be ignored if <backward> is true (as that
0627     // combination has not been implemented yet).
0628     bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin, double *yMin,
0629                   double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen);
0630 
0631     // Get the text which is inside the specified rectangle.
0632     GooString *getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const;
0633 
0634     void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);
0635 
0636     void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color);
0637 
0638     std::vector<PDFRectangle *> *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale);
0639 
0640     GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style);
0641 
0642     std::vector<TextWordSelection *> **getSelectionWords(const PDFRectangle *selection, SelectionStyle style, int *nLines);
0643 
0644     // Find a string by character position and length.  If found, sets
0645     // the text bounding rectangle and returns true; otherwise returns
0646     // false.
0647     bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const;
0648 
0649     // Dump contents of page to a file.
0650     void dump(void *outputStream, TextOutputFunc outputFunc, bool physLayout, EndOfLineKind textEOL, bool pageBreaks);
0651 
0652     // Get the head of the linked list of TextFlows.
0653     const TextFlow *getFlows() const { return flows; }
0654 
0655     // If true, will combine characters when a base and combining
0656     // character are drawn on eachother.
0657     void setMergeCombining(bool merge);
0658 
0659 #ifdef TEXTOUT_WORD_LIST
0660     // Build a flat word list, in content stream order (if
0661     // this->rawOrder is true), physical layout order (if <physLayout>
0662     // is true and this->rawOrder is false), or reading order (if both
0663     // flags are false).
0664     std::unique_ptr<TextWordList> makeWordList(bool physLayout);
0665 #endif
0666 
0667 private:
0668     // Destructor.
0669     ~TextPage();
0670 
0671     void clear();
0672     void assignColumns(TextLineFrag *frags, int nFrags, bool rot) const;
0673     int dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const;
0674     void adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax);
0675 
0676     bool rawOrder; // keep text in content stream order
0677     bool discardDiag; // discard diagonal text
0678     bool mergeCombining; // merge when combining and base characters
0679                          // are drawn on top of each other
0680 
0681     double pageWidth, pageHeight; // width and height of current page
0682     TextWord *curWord; // currently active string
0683     int charPos; // next character position (within content
0684                  //   stream)
0685     TextFontInfo *curFont; // current font
0686     double curFontSize; // current font size
0687     int nest; // current nesting level (for Type 3 fonts)
0688     int nTinyChars; // number of "tiny" chars seen so far
0689     bool lastCharOverlap; // set if the last added char overlapped the
0690                           //   previous char
0691     bool diagonal; // whether the current text is diagonal
0692 
0693     std::unique_ptr<TextPool> pools[4]; // a "pool" of TextWords for each rotation
0694     TextFlow *flows; // linked list of flows
0695     TextBlock **blocks; // array of blocks, in yx order
0696     int nBlocks; // number of blocks
0697     int primaryRot; // primary rotation
0698     bool primaryLR; // primary direction (true means L-to-R,
0699                     //   false means R-to-L)
0700     TextWord *rawWords; // list of words, in raw order (only if
0701                         //   rawOrder is set)
0702     TextWord *rawLastWord; // last word on rawWords list
0703 
0704     std::vector<std::unique_ptr<TextFontInfo>> fonts; // all font info objects used on this page
0705 
0706     double lastFindXMin, // coordinates of the last "find" result
0707             lastFindYMin;
0708     bool haveLastFind;
0709 
0710     std::vector<std::unique_ptr<TextUnderline>> underlines;
0711     std::vector<std::unique_ptr<TextLink>> links;
0712 
0713     int refCnt;
0714 
0715     friend class TextLine;
0716     friend class TextLineFrag;
0717     friend class TextBlock;
0718     friend class TextFlow;
0719     friend class TextWordList;
0720     friend class TextSelectionPainter;
0721     friend class TextSelectionDumper;
0722 };
0723 
0724 //------------------------------------------------------------------------
0725 // ActualText
0726 //------------------------------------------------------------------------
0727 
0728 class POPPLER_PRIVATE_EXPORT ActualText
0729 {
0730 public:
0731     // Create an ActualText
0732     explicit ActualText(TextPage *out);
0733     ~ActualText();
0734 
0735     ActualText(const ActualText &) = delete;
0736     ActualText &operator=(const ActualText &) = delete;
0737 
0738     void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen);
0739     void begin(const GfxState *state, const GooString *text);
0740     void end(const GfxState *state);
0741 
0742 private:
0743     TextPage *text;
0744 
0745     GooString *actualText; // replacement text for the span
0746     double actualTextX0;
0747     double actualTextY0;
0748     double actualTextX1;
0749     double actualTextY1;
0750     int actualTextNBytes;
0751 };
0752 
0753 //------------------------------------------------------------------------
0754 // TextOutputDev
0755 //------------------------------------------------------------------------
0756 
0757 class POPPLER_PRIVATE_EXPORT TextOutputDev : public OutputDev
0758 {
     // OutputDev subclass that overrides the text-drawing hooks declared
     // below and exposes text search, extraction and selection accessors.
0759 public:
         // Default for minColSpacing1; per the note on that member, the
         // value itself is defined in TextOutputDev.cc.
0760     static double minColSpacing1_default;
0761 
0762     // Open a text output file.  If <fileName> is NULL, no file is
0763     // written (this is useful, e.g., for searching text).  If
0764     // <physLayoutA> is true, the original physical layout of the text
0765     // is maintained.  If <rawOrderA> is true, the text is kept in
0766     // content stream order.  If <discardDiagA> is true, diagonal text
0767     // is removed from output.
         // <fixedPitchA> presumably initializes <fixedPitch> (see that
         // member's comment), and <append> presumably selects appending to
         // an existing file instead of overwriting it -- TODO confirm both
         // in TextOutputDev.cc.
0768     TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA = false);
0769 
0770     // Create a TextOutputDev which will write to a generic stream.  If
0771     // <physLayoutA> is true, the original physical layout of the text
0772     // is maintained.  If <rawOrderA> is true, the text is kept in
0773     // content stream order.  If <discardDiagA> is true, diagonal text
0774     // is removed from output.
         // <func> and <stream> presumably end up in <outputFunc> and
         // <outputStream> -- TODO confirm in TextOutputDev.cc.
0775     TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA = false);
0776 
0777     // Destructor.
0778     ~TextOutputDev() override;
0779 
0780     // Check if file was successfully created (returns the <ok> flag).
0781     virtual bool isOk() { return ok; }
0782 
0783     //---- get info about output device
0784 
0785     // Does this device use upside-down coordinates?
0786     // (Upside-down means (0,0) is the top left corner of the page.)
0787     bool upsideDown() override { return true; }
0788 
0789     // Does this device use drawChar() or drawString()?
0790     bool useDrawChar() override { return true; }
0791 
0792     // Does this device use beginType3Char/endType3Char?  Otherwise,
0793     // text in Type 3 fonts will be drawn with drawChar/drawString.
0794     bool interpretType3Chars() override { return false; }
0795 
0796     // Does this device need non-text content?
0797     bool needNonText() override { return false; }
0798 
0799     // Does this device require incCharCount to be called for text on
0800     // non-shown layers?
0801     bool needCharCount() override { return true; }
0802 
0803     //----- initialization and control
0804 
0805     // Start a page.
0806     void startPage(int pageNum, GfxState *state, XRef *xref) override;
0807 
0808     // End a page.
0809     void endPage() override;
0810 
0811     //----- save/restore graphics state
0812     void restoreState(GfxState *state) override;
0813 
0814     //----- update text state
0815     void updateFont(GfxState *state) override;
0816 
0817     //----- text drawing
0818     void beginString(GfxState *state, const GooString *s) override;
0819     void endString(GfxState *state) override;
0820     void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode *u, int uLen) override;
0821     void incCharCount(int nChars) override;
0822     void beginActualText(GfxState *state, const GooString *text) override;
0823     void endActualText(GfxState *state) override;
0824 
0825     //----- path painting
0826     void stroke(GfxState *state) override;
0827     void fill(GfxState *state) override;
0828     void eoFill(GfxState *state) override;
0829 
0830     //----- link borders
0831     void processLink(AnnotLink *link) override;
0832 
0833     //----- special access
0834 
0835     // Find a string.  If <startAtTop> is true, starts looking at the
0836     // top of the page; else if <startAtLast> is true, starts looking
0837     // immediately after the last find result; else starts looking at
0838     // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
0839     // bottom of the page; else if <stopAtLast> is true, stops looking
0840     // just before the last find result; else stops looking at
0841     // <xMax>,<yMax>.
         // NOTE(review): <xMin>..<yMax> are therefore inputs; presumably
         // they also receive the bounding box of the match when the function
         // returns true -- confirm in TextOutputDev.cc.
0842     bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) const;
0843 
0844     // Get the text which is inside the specified rectangle.
         // NOTE(review): returns a raw GooString*; presumably the caller
         // owns it -- confirm.
0845     GooString *getText(double xMin, double yMin, double xMax, double yMax) const;
0846 
0847     // Find a string by character position and length.  If found, sets
0848     // the text bounding rectangle and returns true; otherwise returns
0849     // false.
0850     bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const;
0851 
         // Selection helpers taking a <selection> rectangle and a
         // SelectionStyle.  NOTE(review): their behavior and the ownership
         // of the returned pointers are not visible in this header --
         // confirm in TextOutputDev.cc.
0852     void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color);
0853 
0854     std::vector<PDFRectangle *> *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale);
0855 
0856     GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style);
0857 
0858     // If true, will combine characters when a base and combining
0859     // character are drawn on each other.
0860     void setMergeCombining(bool merge);
0861 
0862 #ifdef TEXTOUT_WORD_LIST
0863     // Build a flat word list, in content stream order (if
0864     // this->rawOrder is true), physical layout order (if
0865     // this->physLayout is true and this->rawOrder is false), or reading
0866     // order (if both flags are false).
0867     std::unique_ptr<TextWordList> makeWordList();
0868 #endif
0869 
0870     // Returns the TextPage object for the last rasterized page,
0871     // transferring ownership to the caller.
0872     TextPage *takeText();
0873 
0874     // Turn extra processing for HTML conversion on or off.
0875     void enableHTMLExtras(bool doHTMLA) { doHTML = doHTMLA; }
0876 
0877     // Get the head of the linked list of TextFlows for the
0878     // last rasterized page.
0879     const TextFlow *getFlows() const;
0880 
         // Compile-time platform default for the EOL marker: eolDOS when
         // _WIN32 is defined, eolUnix otherwise.
0881     static constexpr EndOfLineKind defaultEndOfLine()
0882     {
0883 #if defined(_WIN32)
0884         return eolDOS;
0885 #else
0886         return eolUnix;
0887 #endif
0888     }
         // Inline accessors that read or assign the private members
         // <textEOL>, <textPageBreaks> and <minColSpacing1> directly.
0889     void setTextEOL(EndOfLineKind textEOLA) { textEOL = textEOLA; }
0890     void setTextPageBreaks(bool textPageBreaksA) { textPageBreaks = textPageBreaksA; }
0891     double getMinColSpacing1() const { return minColSpacing1; }
0892     void setMinColSpacing1(double val) { minColSpacing1 = val; }
0893 
0894 private:
0895     TextOutputFunc outputFunc; // output function
0896     void *outputStream; // output stream
0897     bool needClose; // need to close the output file?
0898                     //   (only if outputStream is a FILE*)
0899     TextPage *text; // text for the current page
0900     bool physLayout; // maintain original physical layout when
0901                      //   dumping text
0902     double fixedPitch; // if physLayout is true and this is non-zero,
0903                        //   assume fixed-pitch characters with this
0904                        //   width
0905     double minColSpacing1; // see default value defined with same name at TextOutputDev.cc
0906     bool rawOrder; // keep text in content stream order
0907     bool discardDiag; // Diagonal text, i.e., text that is not close to one of the
0908                       // 0, 90, 180, or 270 degree axes, is discarded. This is useful
0909                       // to skip watermarks drawn on top of body text, etc.
0910     bool doHTML; // extra processing for HTML conversion
0911     bool ok; // set up ok?
0912     bool textPageBreaks; // insert end-of-page markers?
0913     EndOfLineKind textEOL; // type of EOL marker to use
0914 
0915     ActualText *actualText; // ActualText helper; presumably driven by beginActualText()/endActualText() -- confirm
0916 };
0917 
0918 #endif