From 68ec7887adad160d75bb9c41e6d74e331900f2e9 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Mon, 9 Sep 2024 16:12:49 -0700 Subject: [PATCH 01/81] Start Mdoc lexer --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 331 ++++++++++++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100644 src/Text/Pandoc/Readers/Mdoc/Lex.hs diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs new file mode 100644 index 000000000000..ad380a9b973c --- /dev/null +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -0,0 +1,331 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} +{- | + Module : Text.Pandoc.Readers.Mdoc.Lex + Copyright : Copyright (C) 2018-2020 Yan Pashkovsky and John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : Yan Pashkovsky + Stability : WIP + Portability : portable + +Tokenizer for roff formats (man, ms). +-} +module Text.Pandoc.Readers.Mdoc.Lex + ( MdocToken(..) + , MdocTokens(..) + , lexMdoc + ) +where + +import Safe (lastDef) +import Control.Monad (void, mzero, mplus, guard) +import Control.Monad.Except (throwError) +import Text.Pandoc.Class.PandocMonad + (getResourcePath, readFileFromDirs, PandocMonad(..), report) +import Data.Char (isLower, toLower, toUpper, chr, isAscii, isAlphaNum) +import Data.Default (Default) +import qualified Data.Map as M +import Data.List (intercalate) +import qualified Data.Text as T +import Text.Pandoc.Logging (LogMessage(..)) +import Text.Pandoc.Options +import Text.Pandoc.Parsing +import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.RoffChar (characterCodes, combiningAccents) +import qualified Data.Sequence as Seq +import qualified Data.Foldable as Foldable +import qualified Data.Text.Normalize as Normalize + +-- import Debug.Trace (traceShowId) + +-- +-- Data Types +-- +type TableOption = (T.Text, T.Text) + + +data CellFormat = + CellFormat + { columnType :: Char + , pipePrefix :: Bool + , pipeSuffix :: Bool + , columnSuffixes :: [T.Text] + } deriving (Show, Eq, Ord) + +type TableRow = ([CellFormat], [MdocTokens]) + +data MdocToken = Str T.Text SourcePos + | Macro T.Text SourcePos + | Lit T.Text SourcePos + | Tbl [TableOption] [TableRow] SourcePos + deriving Show + +newtype MdocTokens = MdocTokens { unRoffTokens :: Seq.Seq MdocToken } + deriving (Show, Semigroup, Monoid) + +singleTok :: MdocToken -> MdocTokens +singleTok t = MdocTokens (Seq.singleton t) + +data RoffState = RoffState { tableTabChar :: Char + } deriving Show + +instance Default RoffState where + def = RoffState { tableTabChar = '\t' + } + +type RoffLexer m = ParsecT Sources RoffState m + +-- +-- Lexer: T.Text -> RoffToken +-- + +eofline :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m () +eofline = void newline <|> eof + +spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m Char +spacetab = char ' ' <|> char '\t' + +characterCodeMap :: M.Map T.Text Char +characterCodeMap = + M.fromList $ map (\(x,y) -> (y,x)) characterCodes + +combiningAccentsMap :: M.Map T.Text Char +combiningAccentsMap = + M.fromList $ map (\(x,y) -> (y,x)) combiningAccents + +escape :: PandocMonad m => RoffLexer m T.Text +escape = try $ do + backslash + escapeGlyph <|> escapeNormal + +escapeGlyph :: PandocMonad m => RoffLexer m T.Text +escapeGlyph = do + c <- lookAhead (oneOf ['[','(']) + escapeArg >>= resolveGlyph c + +resolveGlyph :: PandocMonad m => Char -> T.Text -> RoffLexer m T.Text +resolveGlyph delimChar glyph = do + let cs = T.replace "_u" " u" glyph -- unicode glyphs separated by _ + (case T.words cs of + [] -> mzero + [s] -> case M.lookup s characterCodeMap `mplus` readUnicodeChar s of + Nothing -> mzero + Just c -> return $ T.singleton c + (s:ss) -> do + basechar <- case M.lookup s characterCodeMap `mplus` + readUnicodeChar s of + Nothing -> + case T.unpack s of + [ch] | isAscii ch && isAlphaNum ch -> + return ch + _ -> mzero + Just c -> return c + let addAccents [] xs = return $ Normalize.normalize Normalize.NFC $ + T.reverse xs + addAccents (a:as) xs = + case M.lookup a combiningAccentsMap `mplus` readUnicodeChar a of + Just x -> addAccents as $ T.cons x xs + Nothing -> mzero + addAccents ss (T.singleton basechar) >>= \xs -> return xs) + <|> case delimChar of + '[' -> escUnknown ("\\[" <> glyph <> "]") + '(' -> escUnknown ("\\(" <> glyph) + '\'' -> escUnknown ("\\C'" <> glyph <> "'") + _ -> Prelude.fail "resolveGlyph: unknown glyph delimiter" + +readUnicodeChar :: T.Text -> Maybe Char +readUnicodeChar t = case T.uncons t of + Just ('u', cs) | T.length cs > 3 -> chr <$> safeRead ("0x" <> cs) + _ -> Nothing + +escapeNormal :: PandocMonad m => RoffLexer m T.Text +escapeNormal = do + c <- noneOf "{}" + let groffSkip = [escapeArg, countChar 1 (satisfy (/='\n'))] + case c of + ' ' -> return " " -- mandoc_char(7) says this should be a nonbreaking space + '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment + '#' -> mempty <$ manyTill anyChar newline + '%' -> return mempty -- optional hyphenation + '&' -> return mempty -- nonprintable zero-width + ')' -> return mempty -- nonprintable zero-width + '*' -> escIgnore '*' groffSkip + ',' -> return mempty -- to fix spacing after roman + '-' -> return "-" + '.' -> return "." + '/' -> return mempty -- to fix spacing before roman + '0' -> return "\x2007" -- digit-width space + ':' -> return mempty -- zero-width break + 'A' -> escIgnore 'A' [quoteArg] + 'B' -> escIgnore 'B' [quoteArg] + 'C' -> quoteArg >>= resolveGlyph '\'' + 'D' -> escIgnore 'D' [quoteArg] + 'F' -> escIgnore 'F' groffSkip + 'H' -> escIgnore 'H' [quoteArg] + 'L' -> escIgnore 'L' [quoteArg] + 'M' -> escIgnore 'M' groffSkip + 'N' -> escIgnore 'N' [quoteArg] + 'O' -> escIgnore 'O' groffSkip + 'R' -> escIgnore 'R' [quoteArg] + 'S' -> escIgnore 'S' [quoteArg] + 'V' -> escIgnore 'V' groffSkip + 'X' -> escIgnore 'X' [quoteArg] + 'Y' -> escIgnore 'Y' groffSkip + 'Z' -> escIgnore 'Z' [quoteArg] + '\'' -> return "'" + '\n' -> return mempty -- line continuation + '^' -> return "\x200A" -- 1/12 em space + '_' -> return "_" + '`' -> return "`" + 'a' -> return mempty -- "non-interpreted leader character" + 'b' -> escIgnore 'b' [quoteArg] + 'c' -> return mempty -- interrupt text processing + 'd' -> escIgnore 'd' [] -- forward down 1/2em + 'e' -> return "\\" + 'f' -> escIgnore 'f' groffSkip + 'g' -> escIgnore 'g' groffSkip + 'h' -> escIgnore 'h' [quoteArg] + 'k' -> escIgnore 'k' groffSkip + 'l' -> escIgnore 'l' [quoteArg] + 'm' -> escIgnore 'm' groffSkip + 'n' -> escIgnore 'm' groffSkip + 'o' -> escIgnore 'o' [quoteArg] + 'p' -> escIgnore 'p' [] + 'r' -> escIgnore 'r' [] + 's' -> escIgnore 's' [escapeArg, signedNumber] + 't' -> return "\t" + 'u' -> escIgnore 'u' [] + 'v' -> escIgnore 'v' [quoteArg] + 'w' -> escIgnore 'w' [quoteArg] + 'x' -> escIgnore 'x' [quoteArg] + 'z' -> escIgnore 'z' [countChar 1 anyChar] + '|' -> return "\x2006" --1/6 em space + '~' -> return "\160" -- nonbreaking space + '\\' -> return "\\" + _ -> return $ T.singleton c + -- man 7 groff: "If a backslash is followed by a character that + -- does not constitute a defined escape sequence, the backslash + -- is silently ignored and the character maps to itself." + +escIgnore :: PandocMonad m + => Char + -> [RoffLexer m T.Text] + -> RoffLexer m T.Text +escIgnore c argparsers = do + pos <- getPosition + arg <- snd <$> withRaw (choice argparsers) <|> return "" + report $ SkippedContent ("\\" <> T.cons c arg) pos + return mempty + +escUnknown :: PandocMonad m => T.Text -> RoffLexer m T.Text +escUnknown s = do + pos <- getPosition + report $ SkippedContent s pos + return "\xFFFD" + +signedNumber :: PandocMonad m => RoffLexer m T.Text +signedNumber = try $ do + sign <- option "" ("-" <$ char '-' <|> "" <$ char '+') + ds <- many1Char digit + return (sign <> ds) + +-- Parses: [..] or (.. +escapeArg :: PandocMonad m => RoffLexer m T.Text +escapeArg = choice + [ char '[' *> manyTillChar (noneOf ['\n',']']) (char ']') + , char '(' *> countChar 2 (satisfy (/='\n')) + ] + +-- Parses: '..' +quoteArg :: PandocMonad m => RoffLexer m T.Text +quoteArg = char '\'' *> manyTillChar (noneOf ['\n','\'']) (char '\'') + +-- separate function from lexMacro since real man files sometimes do not +-- follow the rules +lexComment :: PandocMonad m => RoffLexer m MdocTokens +lexComment = do + try $ string ".\\\"" + skipMany $ noneOf "\n" + eofline + return mempty + + +argText :: PandocMonad m => RoffLexer m T.Text +argText = mconcat <$> many1 (escape <|> regularText) + +spaceTabChar :: PandocMonad m => RoffLexer m T.Text +spaceTabChar = T.singleton <$> spacetab + +quotedArg :: PandocMonad m => RoffLexer m T.Text +quotedArg = do + quoteChar + t <- mconcat <$> many (escape <|> regularText <|> innerQuote <|> spaceTabChar) + quoteChar + notFollowedBy quoteChar + return t + where + innerQuote = do + string "\"\"" + return "\"" + +anyText :: PandocMonad m => RoffLexer m T.Text +anyText = escape <|> regularText <|> quoteChar <|> spaceTabChar + +backslash :: PandocMonad m => RoffLexer m () +backslash = + (mempty <* char '\\') <|> (mempty <* string "\\E") + +regularText :: PandocMonad m => RoffLexer m T.Text +regularText = many1Char $ noneOf "\n\r\t \\\"" + +quoteChar :: PandocMonad m => RoffLexer m T.Text +quoteChar = T.singleton <$> char '"' + +mdocToken :: PandocMonad m => RoffLexer m MdocTokens +mdocToken = lexComment <|> lexControlLine <|> lexTextLine + +lexMacro :: PandocMonad m => RoffLexer m MdocToken +lexMacro = do + pos <- getPosition + name <- many1Char (satisfy isMacroChar) + skipSpaces + return $ Macro name pos + where + isMacroChar '%' = True + isMacroChar x = isAlphaNum x + +lexLit :: PandocMonad m => RoffLexer m MdocToken +lexLit = do + pos <- getPosition + t <- mconcat <$> many (argText <|> quotedArg) + return $ Lit t pos + +lexTextLine :: PandocMonad m => RoffLexer m MdocTokens +lexTextLine = do + pos <- getPosition + guard $ sourceColumn pos == 1 + notFollowedBy $ char '.' + t <- mconcat <$> many anyText + return $ singleTok $ Str t pos + +lexControlLine :: PandocMonad m => RoffLexer m MdocTokens +lexControlLine = do + pos <- getPosition + guard $ sourceColumn pos == 1 + char '.' + m <- lexMacro + wds <- sepBy (lexLit) spacetab + eofline + return $ MdocTokens $ Seq.singleton m <> Seq.fromList wds + +-- | Tokenize a string as a sequence of roff tokens. +lexMdoc :: PandocMonad m => SourcePos -> T.Text -> m MdocTokens +lexMdoc pos txt = do + eithertokens <- readWithM (do setPosition pos + mconcat <$> many mdocToken) def txt + case eithertokens of + Left e -> throwError e + Right tokenz -> return tokenz From b4bf4f429a188e9051610fc4ccbdfeb1ca5b1197 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Tue, 10 Sep 2024 17:31:46 -0700 Subject: [PATCH 02/81] Start mdoc reader --- src/Text/Pandoc/Readers/Mdoc.hs | 232 ++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 src/Text/Pandoc/Readers/Mdoc.hs diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs new file mode 100644 index 000000000000..c19758c1f30e --- /dev/null +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -0,0 +1,232 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Mdoc + Copyright : + License : GNU GPL, version 2 or above + + Maintainer : a + Stability : WIP + Portability : portable + +Conversion of man to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Mdoc (readMdoc) where + +import Data.Default (Default) +import Control.Monad (mplus, guard, void, when) +import Control.Monad.Except (throwError) +import Data.List (intersperse) +import qualified Data.Text as T +import Text.Pandoc.Definition (Pandoc(Pandoc), Meta) +import Text.Pandoc.Builder (Blocks, Inlines) +import qualified Text.Pandoc.Builder as B +import Text.Pandoc.Class.PandocMonad (PandocMonad(..)) +import Text.Pandoc.Options +import Text.Pandoc.Parsing +import Text.Pandoc.Readers.Mdoc.Lex as Lex -- TODO explicit imports +import Text.Parsec (modifyState) +import qualified Text.Pandoc.Parsing as P +import qualified Data.Foldable as Foldable +import Text.Pandoc.Shared (stringify) + +data MdocSection + = ShName + | ShSynopsis + | ShOther + deriving (Show, Eq) + +data ManState = ManState { readerOptions :: ReaderOptions + , metadata :: Meta + , tableCellsPlain :: Bool + , progName :: Maybe T.Text + , currentSection :: MdocSection + } deriving Show + +instance Default ManState where + def = ManState { readerOptions = def + , metadata = B.nullMeta + , tableCellsPlain = True + , currentSection = ShOther + , progName = Nothing } + +type MdocParser m = P.ParsecT [MdocToken] ManState m + + +-- | Read man (troff) from an input string and return a Pandoc document. +readMdoc :: (PandocMonad m, ToSources a) + => ReaderOptions + -> a + -> m Pandoc +readMdoc opts s = do + let Sources inps = toSources s + tokenz <- mconcat <$> mapM (uncurry lexMdoc) inps + let state = def {readerOptions = opts} :: ManState + eitherdoc <- readWithMTokens parseMdoc state + (Foldable.toList . unRoffTokens $ tokenz) + either (throwError . fromParsecError (Sources inps)) return eitherdoc + + +readWithMTokens :: PandocMonad m + => ParsecT [MdocToken] ManState m a -- ^ parser + -> ManState -- ^ initial state + -> [MdocToken] -- ^ input + -> m (Either ParseError a) +readWithMTokens parser state input = + runParserT parser state "source" input + + +parseMdoc :: PandocMonad m => MdocParser m Pandoc +parseMdoc = do + optional parsePrologue + trace "hasdf" + bs <- many parseBlock <* eof + trace "DFDF" + meta <- metadata <$> getState + let (Pandoc _ blocks) = B.doc $ mconcat bs + return $ Pandoc meta blocks + +msatisfy :: Monad m + => (MdocToken -> Bool) -> P.ParsecT [MdocToken] st m MdocToken +msatisfy predic = P.tokenPrim show nextPos testTok + where + testTok t = if predic t then Just t else Nothing + nextPos _ _ (Macro _ pos':_) = pos' + nextPos _ _ (Lit _ pos':_) = pos' + nextPos _ _ (Str _ pos':_) = pos' + nextPos _ _ (Tbl _ _ pos':_) = pos' + nextPos a _ (Eol{}:x:xs) = nextPos a x xs + nextPos pos _ [Eol] = pos + nextPos pos _ [] = pos + +macro :: PandocMonad m => T.Text -> MdocParser m MdocToken +macro name = msatisfy t where + t (Macro n _) = n == name + t _ = False + +emptyMacro :: PandocMonad m => T.Text -> MdocParser m MdocToken +emptyMacro n = macro n <* eol + +str :: PandocMonad m => MdocParser m MdocToken +str = msatisfy t where + t Str{} = True + t _ = False + +lit :: PandocMonad m => MdocParser m MdocToken +lit = msatisfy t where + t Lit{} = True + t _ = False + +arg :: PandocMonad m => MdocParser m MdocToken +arg = msatisfy t where + t Lit{} = True + t Macro{} = True + t _ = False + +literal :: PandocMonad m => T.Text -> MdocParser m MdocToken +literal n = msatisfy t where + t (Lit n' _) = n == n' + t _ = False + +eol :: PandocMonad m => MdocParser m () +eol = void $ msatisfy t where + t Eol{} = True + t _ = False + +argsToInlines :: PandocMonad m => MdocParser m Inlines +argsToInlines = do + ls <- manyTill arg eol + let strs = map (B.str . toString) ls + return $ mconcat $ intersperse B.space strs + +parsePrologue :: PandocMonad m => MdocParser m () +parsePrologue = do + macro "Dd" + date <- argsToInlines + macro "Dt" + (Lit title _) <- lit + (Lit section _) <- lit + eol + emptyMacro "Os" + let adjust = B.setMeta "title" (B.str title) . B.setMeta "date" date . B.setMeta "section" (B.str section) + modifyState $ \s -> s{metadata = adjust $ metadata s} + +shToSectionMode :: T.Text -> MdocSection +shToSectionMode "NAME" = ShName +shToSectionMode "SYNOPSIS" = ShSynopsis +shToSectionMode _ = ShOther + +parseHeader :: PandocMonad m => MdocParser m Blocks +parseHeader = do + (Macro m _) <- macro "Sh" <|> macro "Ss" + txt <- argsToInlines + let lvl = if m == "Sh" then 1 else 2 + when (lvl == 1) $ modifyState $ \s -> s{currentSection = (shToSectionMode . stringify) txt} + return $ B.header lvl txt + +parseNameSection :: PandocMonad m => MdocParser m Blocks +parseNameSection = do + sec <- currentSection <$> getState + guard $ sec == ShName + macro "Nm" <|> macro "Fn" + pname <- toString <$> arg + -- TODO multiple Nm macros and delimiters + eol + macro "Nd" + desc <- argsToInlines + modifyState $ \s -> s{progName = mplus (progName s) (Just pname)} + skipMany (msatisfy (const True)) + return $ B.para $ B.code pname <> B.space <> "—" <> B.space <> desc + +parseSynopsisSection :: PandocMonad m => MdocParser m Blocks +parseSynopsisSection = do + sec <- currentSection <$> getState + guard $ sec == ShSynopsis + return mempty + +parseStr :: PandocMonad m => MdocParser m Inlines +parseStr = do + (Str txt _) <- str + return $ B.str txt + +parseLit :: PandocMonad m => MdocParser m Inlines +parseLit = do + (Lit txt _) <- lit + return $ B.str txt + +simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines +simpleInline nm xform = do + macro nm + -- inlines <- mconcat <$> many1 parseLit + return $ xform $ B.text "go nuts" + +-- Sy: callable, parsed, >0 arguments +parseSy :: PandocMonad m => MdocParser m Inlines +parseSy = trace "SSS" >> simpleInline "Sy" B.strong + +parseInlineMacro :: PandocMonad m => MdocParser m Inlines +parseInlineMacro = choice [ parseSy ] + +parseInline :: PandocMonad m => MdocParser m Inlines +parseInline = parseStr <|> (parseInlineMacro <* eol) + +parseInlines :: PandocMonad m => MdocParser m Inlines +parseInlines = mconcat <$> many1 parseInline + +parsePara :: PandocMonad m => MdocParser m Blocks +parsePara = B.para . B.trimInlines <$> parseInlines + +parseBlock :: PandocMonad m => MdocParser m Blocks +parseBlock = trace "DFDF" >> choice [ -- parseList + -- , parseDefinitionList + parseHeader + , parseNameSection + , parseSynopsisSection + , parsePara + -- , parseTable + --, parseCodeBlock + -- , parseBlockQuote + -- , parseNewParagraph + -- , skipUnknownMacro + ] + From 419ec786c4b3865246b6489937c043b956e165f4 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Tue, 10 Sep 2024 19:43:26 -0700 Subject: [PATCH 03/81] Register the mdoc reader --- src/Text/Pandoc/Readers.hs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs index ba5170a10b56..2efd061dffed 100644 --- a/src/Text/Pandoc/Readers.hs +++ b/src/Text/Pandoc/Readers.hs @@ -51,6 +51,7 @@ module Text.Pandoc.Readers , readEPUB , readMuse , readMan + , readMdoc , readFB2 , readIpynb , readCSV @@ -106,6 +107,7 @@ import Text.Pandoc.Readers.TWiki import Text.Pandoc.Readers.Txt2Tags import Text.Pandoc.Readers.Vimwiki import Text.Pandoc.Readers.Man +import Text.Pandoc.Readers.Mdoc import Text.Pandoc.Readers.CSV import Text.Pandoc.Readers.CslJson import Text.Pandoc.Readers.BibTeX @@ -168,6 +170,7 @@ readers = [("native" , TextReader readNative) ,("rtf" , TextReader readRTF) ,("typst" , TextReader readTypst) ,("djot" , TextReader readDjot) + ,("mdoc" , TextReader readMdoc) ] -- | Retrieve reader, extensions based on format spec (format+extensions). From f44074e26b5dc1f3a19e27ad77d83d11fa65a670 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Tue, 10 Sep 2024 19:45:46 -0700 Subject: [PATCH 04/81] Checkpoint I'll try to start doing real commits for myself from now on --- src/Text/Pandoc/Readers/Mdoc.hs | 26 ++++++++++++++++--- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 40 +++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index c19758c1f30e..cef5688839ec 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -94,6 +94,7 @@ msatisfy predic = P.tokenPrim show nextPos testTok nextPos _ _ (Macro _ pos':_) = pos' nextPos _ _ (Lit _ pos':_) = pos' nextPos _ _ (Str _ pos':_) = pos' + nextPos _ _ (Delim _ _ pos':_) = pos' nextPos _ _ (Tbl _ _ pos':_) = pos' nextPos a _ (Eol{}:x:xs) = nextPos a x xs nextPos pos _ [Eol] = pos @@ -107,6 +108,11 @@ macro name = msatisfy t where emptyMacro :: PandocMonad m => T.Text -> MdocParser m MdocToken emptyMacro n = macro n <* eol +delim :: PandocMonad m => DelimSide -> MdocParser m MdocToken +delim side = msatisfy t where + t (Delim s _ _) = side == s + t _ = False + str :: PandocMonad m => MdocParser m MdocToken str = msatisfy t where t Str{} = True @@ -194,15 +200,29 @@ parseLit = do (Lit txt _) <- lit return $ B.str txt +parseDelim :: PandocMonad m => DelimSide -> MdocParser m Inlines +parseDelim pos = do + (Delim _ txt _) <- delim pos + return $ B.str txt + +litsToText :: PandocMonad m => MdocParser m Inlines +litsToText = do + ls <- many lit + let strs = map (B.str . toString) ls + return $ mconcat $ intersperse B.space strs + simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines simpleInline nm xform = do macro nm - -- inlines <- mconcat <$> many1 parseLit - return $ xform $ B.text "go nuts" + openDelim <- mconcat <$> many (parseDelim Open) + inlines <- litsToText + closeDelim <- mconcat <$> many (parseDelim Close) + return $ openDelim <> xform inlines <> closeDelim + -- Sy: callable, parsed, >0 arguments parseSy :: PandocMonad m => MdocParser m Inlines -parseSy = trace "SSS" >> simpleInline "Sy" B.strong +parseSy = simpleInline "Sy" B.strong parseInlineMacro :: PandocMonad m => MdocParser m Inlines parseInlineMacro = choice [ parseSy ] diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index ad380a9b973c..7cd8bb05274a 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -16,7 +16,9 @@ Tokenizer for roff formats (man, ms). module Text.Pandoc.Readers.Mdoc.Lex ( MdocToken(..) , MdocTokens(..) + , DelimSide(..) , lexMdoc + , toString ) where @@ -57,12 +59,24 @@ data CellFormat = type TableRow = ([CellFormat], [MdocTokens]) +data DelimSide = Open | Middle | Close deriving (Show, Eq) + data MdocToken = Str T.Text SourcePos | Macro T.Text SourcePos | Lit T.Text SourcePos + | Delim DelimSide T.Text SourcePos | Tbl [TableOption] [TableRow] SourcePos + | Eol deriving Show +toString :: MdocToken -> T.Text +toString (Str x _) = x +toString (Macro x _) = x +toString (Lit x _) = x +toString (Delim _ x _) = x +toString Tbl{} = mempty +toString Eol = mempty + newtype MdocTokens = MdocTokens { unRoffTokens :: Seq.Seq MdocToken } deriving (Show, Semigroup, Monoid) @@ -82,8 +96,10 @@ type RoffLexer m = ParsecT Sources RoffState m -- Lexer: T.Text -> RoffToken -- -eofline :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m () -eofline = void newline <|> eof +eofline :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m MdocToken +eofline = do + void newline <|> eof + return Eol spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m Char spacetab = char ' ' <|> char '\t' @@ -297,18 +313,25 @@ lexMacro = do isMacroChar '%' = True isMacroChar x = isAlphaNum x +lexDelim :: PandocMonad m => RoffLexer m MdocToken +lexDelim = do + pos <- getPosition + t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"] + return $ t pos + lexLit :: PandocMonad m => RoffLexer m MdocToken lexLit = do pos <- getPosition - t <- mconcat <$> many (argText <|> quotedArg) + t <- argText <|> quotedArg + guard $ not $ T.null t return $ Lit t pos lexTextLine :: PandocMonad m => RoffLexer m MdocTokens lexTextLine = do pos <- getPosition guard $ sourceColumn pos == 1 - notFollowedBy $ char '.' t <- mconcat <$> many anyText + eofline return $ singleTok $ Str t pos lexControlLine :: PandocMonad m => RoffLexer m MdocTokens @@ -317,15 +340,16 @@ lexControlLine = do guard $ sourceColumn pos == 1 char '.' m <- lexMacro - wds <- sepBy (lexLit) spacetab - eofline - return $ MdocTokens $ Seq.singleton m <> Seq.fromList wds + wds <- sepBy (lexDelim <|> lexLit) spacetab + skipSpaces + e <- eofline + return $ MdocTokens $ Seq.fromList $ (m:wds) <> [e] -- | Tokenize a string as a sequence of roff tokens. lexMdoc :: PandocMonad m => SourcePos -> T.Text -> m MdocTokens lexMdoc pos txt = do eithertokens <- readWithM (do setPosition pos - mconcat <$> many mdocToken) def txt + mconcat <$> manyTill mdocToken eof) def txt case eithertokens of Left e -> throwError e Right tokenz -> return tokenz From b301ee08692df0894d31f68d228bc75933c45f6e Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Tue, 10 Sep 2024 23:23:00 -0700 Subject: [PATCH 05/81] stash pandoc.cabal nopub --- pandoc.cabal | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandoc.cabal b/pandoc.cabal index 838ad674ec21..bd986ee84b38 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -547,6 +547,7 @@ library hs-source-dirs: src exposed-modules: Text.Pandoc, + Text.Pandoc.Readers.Roff, Text.Pandoc.App, Text.Pandoc.Data, Text.Pandoc.Options, @@ -588,6 +589,8 @@ library Text.Pandoc.Readers.EPUB, Text.Pandoc.Readers.Muse, Text.Pandoc.Readers.Man, + Text.Pandoc.Readers.Mdoc, + Text.Pandoc.Readers.Mdoc.Lex, Text.Pandoc.Readers.FB2, Text.Pandoc.Readers.DokuWiki, Text.Pandoc.Readers.Ipynb, @@ -722,7 +725,6 @@ library Text.Pandoc.Readers.Org.Parsing, Text.Pandoc.Readers.Org.Shared, Text.Pandoc.Readers.Metadata, - Text.Pandoc.Readers.Roff, Text.Pandoc.Writers.Docx.OpenXML, Text.Pandoc.Writers.Docx.StyleMap, Text.Pandoc.Writers.Docx.Table, @@ -858,3 +860,8 @@ benchmark benchmark-pandoc deepseq -- we increase heap size to avoid benchmarking garbage collection: ghc-options: -rtsopts -with-rtsopts=-A8m -threaded + +executable lexroff + import: common-executable + main-is: lexroff.hs + build-depends: pandoc, text From fe1659dae52d26decceaa3c21e97f47e6f906140 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 12:19:05 -0700 Subject: [PATCH 06/81] Use standard spaceChar Replacing spacetab copied from Roff lexer --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index 7cd8bb05274a..4de43db65e54 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -101,9 +101,6 @@ eofline = do void newline <|> eof return Eol -spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m Char -spacetab = char ' ' <|> char '\t' - characterCodeMap :: M.Map T.Text Char characterCodeMap = M.fromList $ map (\(x,y) -> (y,x)) characterCodes @@ -273,7 +270,7 @@ argText :: PandocMonad m => RoffLexer m T.Text argText = mconcat <$> many1 (escape <|> regularText) spaceTabChar :: PandocMonad m => RoffLexer m T.Text -spaceTabChar = T.singleton <$> spacetab +spaceTabChar = T.singleton <$> spaceChar quotedArg :: PandocMonad m => RoffLexer m T.Text quotedArg = do From 35900e9868ae117df8a4a3c85c07b852ac3a62d7 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 12:20:42 -0700 Subject: [PATCH 07/81] Consume spaces when lexing mdoc control tokens --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index 4de43db65e54..9301841e24ad 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -314,6 +314,7 @@ lexDelim :: PandocMonad m => RoffLexer m MdocToken lexDelim = do pos <- getPosition t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"] + skipSpaces return $ t pos lexLit :: PandocMonad m => RoffLexer m MdocToken @@ -321,6 +322,7 @@ lexLit = do pos <- getPosition t <- argText <|> quotedArg guard $ not $ T.null t + skipSpaces return $ Lit t pos lexTextLine :: PandocMonad m => RoffLexer m MdocTokens @@ -337,9 +339,7 @@ lexControlLine = do guard $ sourceColumn pos == 1 char '.' m <- lexMacro - wds <- sepBy (lexDelim <|> lexLit) spacetab - skipSpaces - e <- eofline + (wds, e) <- manyUntil (lexDelim <|> lexLit) eofline return $ MdocTokens $ Seq.fromList $ (m:wds) <> [e] -- | Tokenize a string as a sequence of roff tokens. From 50383346eae1e4c9deffa03cf11cf1d3a89bdbcb Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 12:27:10 -0700 Subject: [PATCH 08/81] Backtrack delim lexing --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index 9301841e24ad..b82fdeb2da0c 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -314,6 +314,7 @@ lexDelim :: PandocMonad m => RoffLexer m MdocToken lexDelim = do pos <- getPosition t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"] + eof <|> void (lookAhead (spaceChar <|> newline)) skipSpaces return $ t pos @@ -339,7 +340,7 @@ lexControlLine = do guard $ sourceColumn pos == 1 char '.' m <- lexMacro - (wds, e) <- manyUntil (lexDelim <|> lexLit) eofline + (wds, e) <- manyUntil (try lexDelim <|> lexLit) eofline return $ MdocTokens $ Seq.fromList $ (m:wds) <> [e] -- | Tokenize a string as a sequence of roff tokens. From fd2ee1bf6d5160e5f8c6876e0293cf82edc48944 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 14:40:00 -0700 Subject: [PATCH 09/81] Delete some traces --- src/Text/Pandoc/Readers/Mdoc.hs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index cef5688839ec..222ac633e876 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -79,9 +79,7 @@ readWithMTokens parser state input = parseMdoc :: PandocMonad m => MdocParser m Pandoc parseMdoc = do optional parsePrologue - trace "hasdf" bs <- many parseBlock <* eof - trace "DFDF" meta <- metadata <$> getState let (Pandoc _ blocks) = B.doc $ mconcat bs return $ Pandoc meta blocks @@ -237,7 +235,7 @@ parsePara :: PandocMonad m => MdocParser m Blocks parsePara = B.para . B.trimInlines <$> parseInlines parseBlock :: PandocMonad m => MdocParser m Blocks -parseBlock = trace "DFDF" >> choice [ -- parseList +parseBlock = choice [ -- parseList -- , parseDefinitionList parseHeader , parseNameSection From d88e71df308ceb0cc5a7662f738c119132993a10 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 15:16:55 -0700 Subject: [PATCH 10/81] Remove skip to end --- src/Text/Pandoc/Readers/Mdoc.hs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 222ac633e876..b0eadd2055d3 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -179,7 +179,6 @@ parseNameSection = do macro "Nd" desc <- argsToInlines modifyState $ \s -> s{progName = mplus (progName s) (Just pname)} - skipMany (msatisfy (const True)) return $ B.para $ B.code pname <> B.space <> "—" <> B.space <> desc parseSynopsisSection :: PandocMonad m => MdocParser m Blocks From eec517811b557e006c87dfa0824b845649730f43 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 16:59:51 -0700 Subject: [PATCH 11/81] Lex blank mdoc lines mandoc's roff(7) says "Blank text lines, which may include whitespace, are only permitted within literal contexts." mandoc -T lint warns about blank lines and inserts a roff `sp` request, which is handled differently depending on the output format. My read is that mandoc considers the handling of a blank line in non-literal context in mdoc(7) to be undefined. --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index b82fdeb2da0c..88e7ea4421bf 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -64,6 +64,7 @@ data DelimSide = Open | Middle | Close deriving (Show, Eq) data MdocToken = Str T.Text SourcePos | Macro T.Text SourcePos | Lit T.Text SourcePos + | Blank SourcePos | Delim DelimSide T.Text SourcePos | Tbl [TableOption] [TableRow] SourcePos | Eol @@ -74,6 +75,7 @@ toString (Str x _) = x toString (Macro x _) = x toString (Lit x _) = x toString (Delim _ x _) = x +toString Blank{} = mempty toString Tbl{} = mempty toString Eol = mempty @@ -332,7 +334,9 @@ lexTextLine = do guard $ sourceColumn pos == 1 t <- mconcat <$> many anyText eofline - return $ singleTok $ Str t pos + if T.null $ T.strip t + then return $ singleTok $ Blank pos + else return $ singleTok $ Str t pos lexControlLine :: PandocMonad m => RoffLexer m MdocTokens lexControlLine = do From f4383b950a1e2669990cab1071bc06cf826b4865 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 17:14:34 -0700 Subject: [PATCH 12/81] Handle blanks in mdoc parser --- src/Text/Pandoc/Readers/Mdoc.hs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index b0eadd2055d3..d2cd392204bf 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -93,6 +93,7 @@ msatisfy predic = P.tokenPrim show nextPos testTok nextPos _ _ (Lit _ pos':_) = pos' nextPos _ _ (Str _ pos':_) = pos' nextPos _ _ (Delim _ _ pos':_) = pos' + nextPos _ _ (Blank pos':_) = pos' nextPos _ _ (Tbl _ _ pos':_) = pos' nextPos a _ (Eol{}:x:xs) = nextPos a x xs nextPos pos _ [Eol] = pos @@ -132,6 +133,11 @@ literal n = msatisfy t where t (Lit n' _) = n == n' t _ = False +blank :: PandocMonad m => MdocParser m MdocToken +blank = msatisfy t where + t Blank{} = True + t _ = False + eol :: PandocMonad m => MdocParser m () eol = void $ msatisfy t where t Eol{} = True @@ -233,6 +239,9 @@ parseInlines = mconcat <$> many1 parseInline parsePara :: PandocMonad m => MdocParser m Blocks parsePara = B.para . B.trimInlines <$> parseInlines +skipBlanks :: PandocMonad m => MdocParser m Blocks +skipBlanks = many1 blank *> mempty + parseBlock :: PandocMonad m => MdocParser m Blocks parseBlock = choice [ -- parseList -- , parseDefinitionList @@ -242,6 +251,7 @@ parseBlock = choice [ -- parseList , parsePara -- , parseTable --, parseCodeBlock + , skipBlanks -- , parseBlockQuote -- , parseNewParagraph -- , skipUnknownMacro From 353943cc46ef28d103b3f4c183e261d8803f1c12 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 17:18:45 -0700 Subject: [PATCH 13/81] Comment upon parseStr's output --- src/Text/Pandoc/Readers/Mdoc.hs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index d2cd392204bf..94487e215ac5 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -193,6 +193,9 @@ parseSynopsisSection = do guard $ sec == ShSynopsis return mempty +-- parseStr doesn't use B.text because roff(7) specifies that +-- whitespace in text lines is treated literally. +-- XXX is this what we actually want? parseStr :: PandocMonad m => MdocParser m Inlines parseStr = do (Str txt _) <- str From 5a56e5d076dc7f03082d3443ac41791fc4673726 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 17:37:47 -0700 Subject: [PATCH 14/81] Remove unused/redundant imports Copy-pasted. Maybe they'll come back. --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index 88e7ea4421bf..4eacfc391e74 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -1,7 +1,6 @@ {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE OverloadedStrings #-} -{-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.Mdoc.Lex Copyright : Copyright (C) 2018-2020 Yan Pashkovsky and John MacFarlane @@ -22,15 +21,12 @@ module Text.Pandoc.Readers.Mdoc.Lex ) where -import Safe (lastDef) import Control.Monad (void, mzero, mplus, guard) import Control.Monad.Except (throwError) -import Text.Pandoc.Class.PandocMonad - (getResourcePath, readFileFromDirs, PandocMonad(..), report) -import Data.Char (isLower, toLower, toUpper, chr, isAscii, isAlphaNum) +import Text.Pandoc.Class.PandocMonad (PandocMonad(..), report) +import Data.Char (chr, isAscii, isAlphaNum) import Data.Default (Default) import qualified Data.Map as M -import Data.List (intercalate) import qualified Data.Text as T import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options @@ -38,7 +34,6 @@ import Text.Pandoc.Parsing import Text.Pandoc.Shared (safeRead) import Text.Pandoc.RoffChar (characterCodes, combiningAccents) import qualified Data.Sequence as Seq -import qualified Data.Foldable as Foldable import qualified Data.Text.Normalize as Normalize -- import Debug.Trace (traceShowId) From 754ffc85f5b433fc3b8f5effdc6d416610580f84 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 19:40:43 -0700 Subject: [PATCH 15/81] Make more progress on inlines/paras --- src/Text/Pandoc/Readers/Mdoc.hs | 41 ++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 94487e215ac5..2bfd6f3f136d 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -143,6 +143,16 @@ eol = void $ msatisfy t where t Eol{} = True t _ = False +inlineContextEnd :: PandocMonad m => MdocParser m () +inlineContextEnd = eof <|> (void . lookAhead $ msatisfy t) where + t Eol{} = True + t Macro{} = True + t Str{} = True -- shouldn't be lexed + t Tbl{} = True -- shouldn't be lexed + t Blank{} = True -- shouldn't be lexed + t Lit{} = False + t Delim{} = False + argsToInlines :: PandocMonad m => MdocParser m Inlines argsToInlines = do ls <- manyTill arg eol @@ -213,34 +223,49 @@ parseDelim pos = do litsToText :: PandocMonad m => MdocParser m Inlines litsToText = do - ls <- many lit + ls <- many1 lit let strs = map (B.str . toString) ls return $ mconcat $ intersperse B.space strs simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines simpleInline nm xform = do macro nm - openDelim <- mconcat <$> many (parseDelim Open) - inlines <- litsToText - closeDelim <- mconcat <$> many (parseDelim Close) - return $ openDelim <> xform inlines <> closeDelim + segs <- manyTill segment inlineContextEnd + return $ mconcat $ intersperse B.space segs + where + segment = do + openDelim <- mconcat <$> many (parseDelim Open) + inlines <- option mempty litsToText + closeDelim <- mconcat <$> many (parseDelim Close) + let xform' x = if null x then mempty else xform x + return $ openDelim <> xform' inlines <> closeDelim -- Sy: callable, parsed, >0 arguments +-- mandoc -T html formats Sy with a tag, since it's not really +-- semantically , but Strong is our best option in Pandoc parseSy :: PandocMonad m => MdocParser m Inlines parseSy = simpleInline "Sy" B.strong +parseEm :: PandocMonad m => MdocParser m Inlines +parseEm = simpleInline "Em" B.emph + parseInlineMacro :: PandocMonad m => MdocParser m Inlines -parseInlineMacro = choice [ parseSy ] +parseInlineMacro = choice [ parseSy, parseEm ] +-- TODO this doesn't handle inline macros being interrupted +-- by other ones yet, but the lexer doesn't handle it yet +-- either parseInline :: PandocMonad m => MdocParser m Inlines parseInline = parseStr <|> (parseInlineMacro <* eol) parseInlines :: PandocMonad m => MdocParser m Inlines -parseInlines = mconcat <$> many1 parseInline +parseInlines = mconcat . intersperse B.space <$> many1 parseInline parsePara :: PandocMonad m => MdocParser m Blocks -parsePara = B.para . B.trimInlines <$> parseInlines +parsePara = do + optional (emptyMacro "Pp") + B.para . B.trimInlines <$> parseInlines skipBlanks :: PandocMonad m => MdocParser m Blocks skipBlanks = many1 blank *> mempty From e1eddf630dfe8cf147fec4ba77f3974bfd02c129 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 11 Sep 2024 19:42:53 -0700 Subject: [PATCH 16/81] Add parsing of code blocks from mdoc --- src/Text/Pandoc/Readers/Mdoc.hs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 2bfd6f3f136d..218130c9934d 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -267,6 +267,22 @@ parsePara = do optional (emptyMacro "Pp") B.para . B.trimInlines <$> parseInlines +-- CodeBlocks can't contain any other markup, but mdoc +-- still interprets control lines within .Bd -literal +-- blocks. Just ignoring this for now and failing if +-- we get any control lines inside a Bd literal +parseCodeBlock :: PandocMonad m => MdocParser m Blocks +parseCodeBlock = do + macro "Bd" -- TODO will need to hoist + literal "-literal" + optional (literal "-offset" *> lit) + optional (literal "-compact") + eol + l <- T.unlines . map toString <$> many (str <|> blank) + emptyMacro "Ed" + return $ B.codeBlock l + + skipBlanks :: PandocMonad m => MdocParser m Blocks skipBlanks = many1 blank *> mempty @@ -278,7 +294,7 @@ parseBlock = choice [ -- parseList , parseSynopsisSection , parsePara -- , parseTable - --, parseCodeBlock + , parseCodeBlock , skipBlanks -- , parseBlockQuote -- , parseNewParagraph From 9e44bb6d40d1cf3ad25d2d4d7d460a33365783b4 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 12 Sep 2024 00:40:51 -0700 Subject: [PATCH 17/81] tktk: Unify roff escapes --- pandoc.cabal | 1 + src/Text/Pandoc/Readers/Mdoc/Lex.hs | 215 +++----------------- src/Text/Pandoc/Readers/Roff.hs | 271 +++++-------------------- src/Text/Pandoc/Readers/Roff/Escape.hs | 198 ++++++++++++++++++ 4 files changed, 280 insertions(+), 405 deletions(-) create mode 100644 src/Text/Pandoc/Readers/Roff/Escape.hs diff --git a/pandoc.cabal b/pandoc.cabal index bd986ee84b38..b885c3db52f1 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -724,6 +724,7 @@ library Text.Pandoc.Readers.Org.ParserState, Text.Pandoc.Readers.Org.Parsing, Text.Pandoc.Readers.Org.Shared, + Text.Pandoc.Readers.Roff.Escape, Text.Pandoc.Readers.Metadata, Text.Pandoc.Writers.Docx.OpenXML, Text.Pandoc.Writers.Docx.StyleMap, diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index 4eacfc391e74..03e3f999aa35 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -1,6 +1,9 @@ {-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TypeFamilies #-} {- | Module : Text.Pandoc.Readers.Mdoc.Lex Copyright : Copyright (C) 2018-2020 Yan Pashkovsky and John MacFarlane @@ -21,20 +24,16 @@ module Text.Pandoc.Readers.Mdoc.Lex ) where -import Control.Monad (void, mzero, mplus, guard) +import Control.Monad (void, guard) import Control.Monad.Except (throwError) -import Text.Pandoc.Class.PandocMonad (PandocMonad(..), report) -import Data.Char (chr, isAscii, isAlphaNum) +import Text.Pandoc.Class.PandocMonad (PandocMonad(..)) +import Data.Char (isAlphaNum) import Data.Default (Default) -import qualified Data.Map as M import qualified Data.Text as T -import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing -import Text.Pandoc.Shared (safeRead) -import Text.Pandoc.RoffChar (characterCodes, combiningAccents) +import Text.Pandoc.Readers.Roff.Escape import qualified Data.Sequence as Seq -import qualified Data.Text.Normalize as Normalize -- import Debug.Trace (traceShowId) @@ -80,14 +79,20 @@ newtype MdocTokens = MdocTokens { unRoffTokens :: Seq.Seq MdocToken } singleTok :: MdocToken -> MdocTokens singleTok t = MdocTokens (Seq.singleton t) -data RoffState = RoffState { tableTabChar :: Char - } deriving Show +data RoffState = RoffState deriving Show instance Default RoffState where - def = RoffState { tableTabChar = '\t' - } + def = RoffState -type RoffLexer m = ParsecT Sources RoffState m +type Lexer m = ParsecT Sources RoffState m + +instance RoffMonad MdocTokens where + type Token MdocTokens = T.Text + type State MdocTokens = RoffState + expandString = return () + escString = return mempty + emit = id + backslash = (mempty <* char '\\') <|> (mempty <* string "\\E") -- -- Lexer: T.Text -> RoffToken @@ -98,164 +103,10 @@ eofline = do void newline <|> eof return Eol -characterCodeMap :: M.Map T.Text Char -characterCodeMap = - M.fromList $ map (\(x,y) -> (y,x)) characterCodes - -combiningAccentsMap :: M.Map T.Text Char -combiningAccentsMap = - M.fromList $ map (\(x,y) -> (y,x)) combiningAccents - -escape :: PandocMonad m => RoffLexer m T.Text -escape = try $ do - backslash - escapeGlyph <|> escapeNormal - -escapeGlyph :: PandocMonad m => RoffLexer m T.Text -escapeGlyph = do - c <- lookAhead (oneOf ['[','(']) - escapeArg >>= resolveGlyph c - -resolveGlyph :: PandocMonad m => Char -> T.Text -> RoffLexer m T.Text -resolveGlyph delimChar glyph = do - let cs = T.replace "_u" " u" glyph -- unicode glyphs separated by _ - (case T.words cs of - [] -> mzero - [s] -> case M.lookup s characterCodeMap `mplus` readUnicodeChar s of - Nothing -> mzero - Just c -> return $ T.singleton c - (s:ss) -> do - basechar <- case M.lookup s characterCodeMap `mplus` - readUnicodeChar s of - Nothing -> - case T.unpack s of - [ch] | isAscii ch && isAlphaNum ch -> - return ch - _ -> mzero - Just c -> return c - let addAccents [] xs = return $ Normalize.normalize Normalize.NFC $ - T.reverse xs - addAccents (a:as) xs = - case M.lookup a combiningAccentsMap `mplus` readUnicodeChar a of - Just x -> addAccents as $ T.cons x xs - Nothing -> mzero - addAccents ss (T.singleton basechar) >>= \xs -> return xs) - <|> case delimChar of - '[' -> escUnknown ("\\[" <> glyph <> "]") - '(' -> escUnknown ("\\(" <> glyph) - '\'' -> escUnknown ("\\C'" <> glyph <> "'") - _ -> Prelude.fail "resolveGlyph: unknown glyph delimiter" - -readUnicodeChar :: T.Text -> Maybe Char -readUnicodeChar t = case T.uncons t of - Just ('u', cs) | T.length cs > 3 -> chr <$> safeRead ("0x" <> cs) - _ -> Nothing - -escapeNormal :: PandocMonad m => RoffLexer m T.Text -escapeNormal = do - c <- noneOf "{}" - let groffSkip = [escapeArg, countChar 1 (satisfy (/='\n'))] - case c of - ' ' -> return " " -- mandoc_char(7) says this should be a nonbreaking space - '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment - '#' -> mempty <$ manyTill anyChar newline - '%' -> return mempty -- optional hyphenation - '&' -> return mempty -- nonprintable zero-width - ')' -> return mempty -- nonprintable zero-width - '*' -> escIgnore '*' groffSkip - ',' -> return mempty -- to fix spacing after roman - '-' -> return "-" - '.' -> return "." - '/' -> return mempty -- to fix spacing before roman - '0' -> return "\x2007" -- digit-width space - ':' -> return mempty -- zero-width break - 'A' -> escIgnore 'A' [quoteArg] - 'B' -> escIgnore 'B' [quoteArg] - 'C' -> quoteArg >>= resolveGlyph '\'' - 'D' -> escIgnore 'D' [quoteArg] - 'F' -> escIgnore 'F' groffSkip - 'H' -> escIgnore 'H' [quoteArg] - 'L' -> escIgnore 'L' [quoteArg] - 'M' -> escIgnore 'M' groffSkip - 'N' -> escIgnore 'N' [quoteArg] - 'O' -> escIgnore 'O' groffSkip - 'R' -> escIgnore 'R' [quoteArg] - 'S' -> escIgnore 'S' [quoteArg] - 'V' -> escIgnore 'V' groffSkip - 'X' -> escIgnore 'X' [quoteArg] - 'Y' -> escIgnore 'Y' groffSkip - 'Z' -> escIgnore 'Z' [quoteArg] - '\'' -> return "'" - '\n' -> return mempty -- line continuation - '^' -> return "\x200A" -- 1/12 em space - '_' -> return "_" - '`' -> return "`" - 'a' -> return mempty -- "non-interpreted leader character" - 'b' -> escIgnore 'b' [quoteArg] - 'c' -> return mempty -- interrupt text processing - 'd' -> escIgnore 'd' [] -- forward down 1/2em - 'e' -> return "\\" - 'f' -> escIgnore 'f' groffSkip - 'g' -> escIgnore 'g' groffSkip - 'h' -> escIgnore 'h' [quoteArg] - 'k' -> escIgnore 'k' groffSkip - 'l' -> escIgnore 'l' [quoteArg] - 'm' -> escIgnore 'm' groffSkip - 'n' -> escIgnore 'm' groffSkip - 'o' -> escIgnore 'o' [quoteArg] - 'p' -> escIgnore 'p' [] - 'r' -> escIgnore 'r' [] - 's' -> escIgnore 's' [escapeArg, signedNumber] - 't' -> return "\t" - 'u' -> escIgnore 'u' [] - 'v' -> escIgnore 'v' [quoteArg] - 'w' -> escIgnore 'w' [quoteArg] - 'x' -> escIgnore 'x' [quoteArg] - 'z' -> escIgnore 'z' [countChar 1 anyChar] - '|' -> return "\x2006" --1/6 em space - '~' -> return "\160" -- nonbreaking space - '\\' -> return "\\" - _ -> return $ T.singleton c - -- man 7 groff: "If a backslash is followed by a character that - -- does not constitute a defined escape sequence, the backslash - -- is silently ignored and the character maps to itself." - -escIgnore :: PandocMonad m - => Char - -> [RoffLexer m T.Text] - -> RoffLexer m T.Text -escIgnore c argparsers = do - pos <- getPosition - arg <- snd <$> withRaw (choice argparsers) <|> return "" - report $ SkippedContent ("\\" <> T.cons c arg) pos - return mempty - -escUnknown :: PandocMonad m => T.Text -> RoffLexer m T.Text -escUnknown s = do - pos <- getPosition - report $ SkippedContent s pos - return "\xFFFD" - -signedNumber :: PandocMonad m => RoffLexer m T.Text -signedNumber = try $ do - sign <- option "" ("-" <$ char '-' <|> "" <$ char '+') - ds <- many1Char digit - return (sign <> ds) - --- Parses: [..] or (.. -escapeArg :: PandocMonad m => RoffLexer m T.Text -escapeArg = choice - [ char '[' *> manyTillChar (noneOf ['\n',']']) (char ']') - , char '(' *> countChar 2 (satisfy (/='\n')) - ] - --- Parses: '..' -quoteArg :: PandocMonad m => RoffLexer m T.Text -quoteArg = char '\'' *> manyTillChar (noneOf ['\n','\'']) (char '\'') -- separate function from lexMacro since real man files sometimes do not -- follow the rules -lexComment :: PandocMonad m => RoffLexer m MdocTokens +lexComment :: PandocMonad m => Lexer m MdocTokens lexComment = do try $ string ".\\\"" skipMany $ noneOf "\n" @@ -263,13 +114,13 @@ lexComment = do return mempty -argText :: PandocMonad m => RoffLexer m T.Text +argText :: PandocMonad m => Lexer m T.Text argText = mconcat <$> many1 (escape <|> regularText) -spaceTabChar :: PandocMonad m => RoffLexer m T.Text +spaceTabChar :: PandocMonad m => Lexer m T.Text spaceTabChar = T.singleton <$> spaceChar -quotedArg :: PandocMonad m => RoffLexer m T.Text +quotedArg :: PandocMonad m => Lexer m T.Text quotedArg = do quoteChar t <- mconcat <$> many (escape <|> regularText <|> innerQuote <|> spaceTabChar) @@ -281,23 +132,19 @@ quotedArg = do string "\"\"" return "\"" -anyText :: PandocMonad m => RoffLexer m T.Text +anyText :: PandocMonad m => Lexer m T.Text anyText = escape <|> regularText <|> quoteChar <|> spaceTabChar -backslash :: PandocMonad m => RoffLexer m () -backslash = - (mempty <* char '\\') <|> (mempty <* string "\\E") - -regularText :: PandocMonad m => RoffLexer m T.Text +regularText :: PandocMonad m => Lexer m T.Text regularText = many1Char $ noneOf "\n\r\t \\\"" -quoteChar :: PandocMonad m => RoffLexer m T.Text +quoteChar :: PandocMonad m => Lexer m T.Text quoteChar = T.singleton <$> char '"' -mdocToken :: PandocMonad m => RoffLexer m MdocTokens +mdocToken :: PandocMonad m => Lexer m MdocTokens mdocToken = lexComment <|> lexControlLine <|> lexTextLine -lexMacro :: PandocMonad m => RoffLexer m MdocToken +lexMacro :: PandocMonad m => Lexer m MdocToken lexMacro = do pos <- getPosition name <- many1Char (satisfy isMacroChar) @@ -307,7 +154,7 @@ lexMacro = do isMacroChar '%' = True isMacroChar x = isAlphaNum x -lexDelim :: PandocMonad m => RoffLexer m MdocToken +lexDelim :: PandocMonad m => Lexer m MdocToken lexDelim = do pos <- getPosition t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"] @@ -315,7 +162,7 @@ lexDelim = do skipSpaces return $ t pos -lexLit :: PandocMonad m => RoffLexer m MdocToken +lexLit :: PandocMonad m => Lexer m MdocToken lexLit = do pos <- getPosition t <- argText <|> quotedArg @@ -323,7 +170,7 @@ lexLit = do skipSpaces return $ Lit t pos -lexTextLine :: PandocMonad m => RoffLexer m MdocTokens +lexTextLine :: PandocMonad m => Lexer m MdocTokens lexTextLine = do pos <- getPosition guard $ sourceColumn pos == 1 @@ -333,7 +180,7 @@ lexTextLine = do then return $ singleTok $ Blank pos else return $ singleTok $ Str t pos -lexControlLine :: PandocMonad m => RoffLexer m MdocTokens +lexControlLine :: PandocMonad m => Lexer m MdocTokens lexControlLine = do pos <- getPosition guard $ sourceColumn pos == 1 diff --git a/src/Text/Pandoc/Readers/Roff.hs b/src/Text/Pandoc/Readers/Roff.hs index 429b90ce16ca..60e3ac6e7b8f 100644 --- a/src/Text/Pandoc/Readers/Roff.hs +++ b/src/Text/Pandoc/Readers/Roff.hs @@ -2,6 +2,7 @@ {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE TypeFamilies #-} {- | Module : Text.Pandoc.Readers.Roff Copyright : Copyright (C) 2018-2020 Yan Pashkovsky and John MacFarlane @@ -29,11 +30,11 @@ module Text.Pandoc.Readers.Roff where import Safe (lastDef) -import Control.Monad (void, mzero, mplus, guard) +import Control.Monad (void, guard) import Control.Monad.Except (throwError) import Text.Pandoc.Class.PandocMonad (getResourcePath, readFileFromDirs, PandocMonad(..), report) -import Data.Char (isLower, toLower, toUpper, chr, isAscii, isAlphaNum) +import Data.Char (isLower, toLower, toUpper, isAlphaNum) import Data.Default (Default) import qualified Data.Map as M import Data.List (intercalate) @@ -42,10 +43,9 @@ import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Shared (safeRead) -import Text.Pandoc.RoffChar (characterCodes, combiningAccents) +import Text.Pandoc.Readers.Roff.Escape import qualified Data.Sequence as Seq import qualified Data.Foldable as Foldable -import qualified Data.Text.Normalize as Normalize -- import Debug.Trace (traceShowId) @@ -65,6 +65,52 @@ data LinePart = RoffStr T.Text | MacroArg Int deriving Show +instance RoffMonad RoffTokens where + type Token RoffTokens = [LinePart] + type State RoffTokens = RoffState + emit t = [RoffStr t] + expandString = try $ do + pos <- getPosition + char '\\' + char '*' + cs <- escapeArg <|> countChar 1 anyChar + s <- linePartsToText <$> resolveText cs pos + addToInput s + escString = try $ do + pos <- getPosition + (do cs <- escapeArg <|> countChar 1 anyChar + resolveText cs pos) + <|> mempty <$ char 'S' + backslash = do + char '\\' + mode <- roffMode <$> getState + case mode of + -- experimentally, it seems you don't always need to double + -- the backslash in macro defs. It's essential with \\$1, + -- but not with \\f[I]. So we make the second one optional. + CopyMode -> optional $ char '\\' + NormalMode -> return () + escE = do + mode <- roffMode <$> getState + case mode of + CopyMode -> return mempty + NormalMode -> return [RoffStr "\\"] + escFont = do + font <- escapeArg <|> countChar 1 alphaNum + font' <- if T.null font || font == "P" + then prevFont <$> getState + else return $ foldr processFontLetter defaultFontSpec $ T.unpack font + updateState $ \st -> st{ prevFont = currentFont st + , currentFont = font' } + return [Font font'] + where + processFontLetter c fs + | isLower c = processFontLetter (toUpper c) fs + processFontLetter 'B' fs = fs{ fontBold = True } + processFontLetter 'I' fs = fs{ fontItalic = True } + processFontLetter 'C' fs = fs{ fontMonospace = True } + processFontLetter _ fs = fs -- do nothing + type Arg = [LinePart] type TableOption = (T.Text, T.Text) @@ -133,198 +179,6 @@ eofline = void newline <|> eof <|> () <$ lookAhead (string "\\}") spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m Char spacetab = char ' ' <|> char '\t' -characterCodeMap :: M.Map T.Text Char -characterCodeMap = - M.fromList $ map (\(x,y) -> (y,x)) characterCodes - -combiningAccentsMap :: M.Map T.Text Char -combiningAccentsMap = - M.fromList $ map (\(x,y) -> (y,x)) combiningAccents - -escape :: PandocMonad m => RoffLexer m [LinePart] -escape = try $ do - backslash - escapeGlyph <|> escapeNormal - -escapeGlyph :: PandocMonad m => RoffLexer m [LinePart] -escapeGlyph = do - c <- lookAhead (oneOf ['[','(']) - escapeArg >>= resolveGlyph c - -resolveGlyph :: PandocMonad m => Char -> T.Text -> RoffLexer m [LinePart] -resolveGlyph delimChar glyph = do - let cs = T.replace "_u" " u" glyph -- unicode glyphs separated by _ - (case T.words cs of - [] -> mzero - [s] -> case M.lookup s characterCodeMap `mplus` readUnicodeChar s of - Nothing -> mzero - Just c -> return [RoffStr $ T.singleton c] - (s:ss) -> do - basechar <- case M.lookup s characterCodeMap `mplus` - readUnicodeChar s of - Nothing -> - case T.unpack s of - [ch] | isAscii ch && isAlphaNum ch -> - return ch - _ -> mzero - Just c -> return c - let addAccents [] xs = return $ Normalize.normalize Normalize.NFC $ - T.reverse xs - addAccents (a:as) xs = - case M.lookup a combiningAccentsMap `mplus` readUnicodeChar a of - Just x -> addAccents as $ T.cons x xs - Nothing -> mzero - addAccents ss (T.singleton basechar) >>= \xs -> return [RoffStr xs]) - <|> case delimChar of - '[' -> escUnknown ("\\[" <> glyph <> "]") - '(' -> escUnknown ("\\(" <> glyph) - '\'' -> escUnknown ("\\C'" <> glyph <> "'") - _ -> Prelude.fail "resolveGlyph: unknown glyph delimiter" - -readUnicodeChar :: T.Text -> Maybe Char -readUnicodeChar t = case T.uncons t of - Just ('u', cs) | T.length cs > 3 -> chr <$> safeRead ("0x" <> cs) - _ -> Nothing - -escapeNormal :: PandocMonad m => RoffLexer m [LinePart] -escapeNormal = do - c <- noneOf "{}" - optional expandString - case c of - ' ' -> return [RoffStr " "] - '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment - '#' -> mempty <$ manyTill anyChar newline - '%' -> return mempty -- optional hyphenation - '&' -> return mempty -- nonprintable zero-width - ')' -> return mempty -- nonprintable zero-width - '*' -> escString - ',' -> return mempty -- to fix spacing after roman - '-' -> return [RoffStr "-"] - '.' -> return [RoffStr "."] - '/' -> return mempty -- to fix spacing before roman - '0' -> return [RoffStr "\x2007"] -- digit-width space - ':' -> return mempty -- zero-width break - 'A' -> quoteArg >>= checkDefined - 'B' -> escIgnore 'B' [quoteArg] - 'C' -> quoteArg >>= resolveGlyph '\'' - 'D' -> escIgnore 'D' [quoteArg] - 'E' -> do - mode <- roffMode <$> getState - case mode of - CopyMode -> return mempty - NormalMode -> return [RoffStr "\\"] - 'H' -> escIgnore 'H' [quoteArg] - 'L' -> escIgnore 'L' [quoteArg] - 'M' -> escIgnore 'M' [escapeArg, countChar 1 (satisfy (/='\n'))] - 'N' -> escIgnore 'N' [quoteArg] - 'O' -> escIgnore 'O' [countChar 1 (oneOf ['0','1'])] - 'R' -> escIgnore 'R' [quoteArg] - 'S' -> escIgnore 'S' [quoteArg] - 'V' -> escIgnore 'V' [escapeArg, countChar 1 alphaNum] - 'X' -> escIgnore 'X' [quoteArg] - 'Y' -> escIgnore 'Y' [escapeArg, countChar 1 (satisfy (/='\n'))] - 'Z' -> escIgnore 'Z' [quoteArg] - '\'' -> return [RoffStr "'"] - '\n' -> return mempty -- line continuation - '^' -> return [RoffStr "\x200A"] -- 1/12 em space - '_' -> return [RoffStr "_"] - '`' -> return [RoffStr "`"] - 'a' -> return mempty -- "non-interpreted leader character" - 'b' -> escIgnore 'b' [quoteArg] - 'c' -> return mempty -- interrupt text processing - 'd' -> escIgnore 'd' [] -- forward down 1/2em - 'e' -> return [RoffStr "\\"] - 'f' -> escFont - 'g' -> escIgnore 'g' [escapeArg, countChar 1 (satisfy (/='\n'))] - 'h' -> escIgnore 'h' [quoteArg] - 'k' -> escIgnore 'k' [escapeArg, countChar 1 (satisfy (/='\n'))] - 'l' -> escIgnore 'l' [quoteArg] - 'm' -> escIgnore 'm' [escapeArg, countChar 1 (satisfy (/='\n'))] - 'n' -> escIgnore 'm' [escapeArg, countChar 1 (satisfy (/='\n'))] - 'o' -> escIgnore 'o' [quoteArg] - 'p' -> escIgnore 'p' [] - 'r' -> escIgnore 'r' [] - 's' -> escIgnore 's' [escapeArg, signedNumber] - 't' -> return [RoffStr "\t"] - 'u' -> escIgnore 'u' [] - 'v' -> escIgnore 'v' [quoteArg] - 'w' -> escIgnore 'w' [quoteArg] - 'x' -> escIgnore 'x' [quoteArg] - 'z' -> escIgnore 'z' [countChar 1 anyChar] - '|' -> return [RoffStr "\x2006"] --1/6 em space - '~' -> return [RoffStr "\160"] -- nonbreaking space - '\\' -> do - mode <- roffMode <$> getState - case mode of - CopyMode -> char '\\' - NormalMode -> return '\\' - return [RoffStr "\\"] - _ -> return [RoffStr $ T.singleton c] - -- man 7 groff: "If a backslash is followed by a character that - -- does not constitute a defined escape sequence, the backslash - -- is silently ignored and the character maps to itself." - -escIgnore :: PandocMonad m - => Char - -> [RoffLexer m T.Text] - -> RoffLexer m [LinePart] -escIgnore c argparsers = do - pos <- getPosition - arg <- snd <$> withRaw (choice argparsers) <|> return "" - report $ SkippedContent ("\\" <> T.cons c arg) pos - return mempty - -escUnknown :: PandocMonad m => T.Text -> RoffLexer m [LinePart] -escUnknown s = do - pos <- getPosition - report $ SkippedContent s pos - return [RoffStr "\xFFFD"] - -signedNumber :: PandocMonad m => RoffLexer m T.Text -signedNumber = try $ do - sign <- option "" ("-" <$ char '-' <|> "" <$ char '+') - ds <- many1Char digit - return (sign <> ds) - --- Parses: [..] or (.. -escapeArg :: PandocMonad m => RoffLexer m T.Text -escapeArg = choice - [ char '[' *> optional expandString *> - manyTillChar (noneOf ['\n',']']) (char ']') - , char '(' *> optional expandString *> - countChar 2 (satisfy (/='\n')) - ] - -expandString :: PandocMonad m => RoffLexer m () -expandString = try $ do - pos <- getPosition - char '\\' - char '*' - cs <- escapeArg <|> countChar 1 anyChar - s <- linePartsToText <$> resolveText cs pos - addToInput s - --- Parses: '..' -quoteArg :: PandocMonad m => RoffLexer m T.Text -quoteArg = char '\'' *> manyTillChar (noneOf ['\n','\'']) (char '\'') - -escFont :: PandocMonad m => RoffLexer m [LinePart] -escFont = do - font <- escapeArg <|> countChar 1 alphaNum - font' <- if T.null font || font == "P" - then prevFont <$> getState - else return $ foldr processFontLetter defaultFontSpec $ T.unpack font - updateState $ \st -> st{ prevFont = currentFont st - , currentFont = font' } - return [Font font'] - where - processFontLetter c fs - | isLower c = processFontLetter (toUpper c) fs - processFontLetter 'B' fs = fs{ fontBold = True } - processFontLetter 'I' fs = fs{ fontItalic = True } - processFontLetter 'C' fs = fs{ fontMonospace = True } - processFontLetter _ fs = fs -- do nothing - -- separate function from lexMacro since real man files sometimes do not -- follow the rules lexComment :: PandocMonad m => RoffLexer m RoffTokens @@ -624,20 +478,6 @@ lexArgs = do char '"' return [RoffStr "\""] -checkDefined :: PandocMonad m => T.Text -> RoffLexer m [LinePart] -checkDefined name = do - macros <- customMacros <$> getState - case M.lookup name macros of - Just _ -> return [RoffStr "1"] - Nothing -> return [RoffStr "0"] - -escString :: PandocMonad m => RoffLexer m [LinePart] -escString = try $ do - pos <- getPosition - (do cs <- escapeArg <|> countChar 1 anyChar - resolveText cs pos) - <|> mempty <$ char 'S' - -- strings and macros share namespace resolveText :: PandocMonad m => T.Text -> SourcePos -> RoffLexer m [LinePart] @@ -668,17 +508,6 @@ linePart :: PandocMonad m => RoffLexer m [LinePart] linePart = macroArg <|> escape <|> regularText <|> quoteChar <|> spaceTabChar -backslash :: PandocMonad m => RoffLexer m () -backslash = do - char '\\' - mode <- roffMode <$> getState - case mode of - -- experimentally, it seems you don't always need to double - -- the backslash in macro defs. It's essential with \\$1, - -- but not with \\f[I]. So we make the second one optional. - CopyMode -> optional $ char '\\' - NormalMode -> return () - macroArg :: PandocMonad m => RoffLexer m [LinePart] macroArg = try $ do pos <- getPosition diff --git a/src/Text/Pandoc/Readers/Roff/Escape.hs b/src/Text/Pandoc/Readers/Roff/Escape.hs new file mode 100644 index 000000000000..109c8c92f1ba --- /dev/null +++ b/src/Text/Pandoc/Readers/Roff/Escape.hs @@ -0,0 +1,198 @@ +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE TypeFamilyDependencies #-} +module Text.Pandoc.Readers.Roff.Escape + ( escape, + escapeArg, + RoffMonad(..), + ) +where +import Text.Pandoc.Class.PandocMonad + ( PandocMonad(..), report, PandocMonad(..), report ) +import Control.Monad + ( mzero, mplus, mzero, mplus ) +import Data.Char (chr, isAscii, isAlphaNum) +import qualified Data.Map as M +import qualified Data.Text as T +import Text.Pandoc.Logging (LogMessage(..)) +import Text.Pandoc.Parsing +import Text.Pandoc.Shared (safeRead) +import qualified Data.Text.Normalize as Normalize +import Text.Pandoc.RoffChar (characterCodes, combiningAccents) + +type Lexer m x = ParsecT Sources (State x) m + +class (Monoid (Token x)) => RoffMonad x where + type State x = a | a -> x + type Token x = a | a -> x + emit :: T.Text -> Token x + expandString :: PandocMonad m => Lexer m x () + escString :: PandocMonad m => Lexer m x (Token x) + backslash :: PandocMonad m => Lexer m x () + checkDefined :: PandocMonad m => T.Text -> Lexer m x (Token x) + checkDefined _ = mempty + escE :: PandocMonad m => Lexer m x (Token x) + escE = return mempty + escFont :: PandocMonad m => Lexer m x (Token x) + escFont = escIgnore 'f' [escapeArg, countChar 1 (satisfy (/='\n'))] + + +characterCodeMap :: M.Map T.Text Char +characterCodeMap = + M.fromList $ map (\(x,y) -> (y,x)) characterCodes + +combiningAccentsMap :: M.Map T.Text Char +combiningAccentsMap = + M.fromList $ map (\(x,y) -> (y,x)) combiningAccents + +escape :: (PandocMonad m, RoffMonad x) => Lexer m x (Token x) +escape = try $ do + backslash + escapeGlyph <|> escapeNormal + +escapeGlyph :: (PandocMonad m, RoffMonad x) => Lexer m x (Token x) +escapeGlyph = do + c <- lookAhead (oneOf ['[','(']) + escapeArg >>= resolveGlyph c + +resolveGlyph :: (PandocMonad m, RoffMonad x) => Char -> T.Text -> Lexer m x (Token x) +resolveGlyph delimChar glyph = do + let cs = T.replace "_u" " u" glyph -- unicode glyphs separated by _ + (case T.words cs of + [] -> mzero + [s] -> case M.lookup s characterCodeMap `mplus` readUnicodeChar s of + Nothing -> mzero + Just c -> return $ emit $ T.singleton c + (s:ss) -> do + basechar <- case M.lookup s characterCodeMap `mplus` + readUnicodeChar s of + Nothing -> + case T.unpack s of + [ch] | isAscii ch && isAlphaNum ch -> + return ch + _ -> mzero + Just c -> return c + let addAccents [] xs = return $ Normalize.normalize Normalize.NFC $ + T.reverse xs + addAccents (a:as) xs = + case M.lookup a combiningAccentsMap `mplus` readUnicodeChar a of + Just x -> addAccents as $ T.cons x xs + Nothing -> mzero + addAccents ss (T.singleton basechar) >>= \xs -> return $ emit xs) + <|> case delimChar of + '[' -> escUnknown ("\\[" <> glyph <> "]") + '(' -> escUnknown ("\\(" <> glyph) + '\'' -> escUnknown ("\\C'" <> glyph <> "'") + _ -> Prelude.fail "resolveGlyph: unknown glyph delimiter" + +readUnicodeChar :: T.Text -> Maybe Char +readUnicodeChar t = case T.uncons t of + Just ('u', cs) | T.length cs > 3 -> chr <$> safeRead ("0x" <> cs) + _ -> Nothing + +escapeNormal :: (PandocMonad m, RoffMonad x) => Lexer m x (Token x) +escapeNormal = do + c <- noneOf "{}" + optional expandString + let groffSkip = [escapeArg, countChar 1 (satisfy (/='\n'))] + case c of + ' ' -> return $ emit " " -- mandoc_char(7) says this should be a nonbreaking space + '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment + '#' -> mempty <$ manyTill anyChar newline + '%' -> return mempty -- optional hyphenation + '&' -> return mempty -- nonprintable zero-width + ')' -> return mempty -- nonprintable zero-width + '*' -> escString + ',' -> return mempty -- to fix spacing after roman + '-' -> return $ emit "-" + '.' -> return $ emit "." + '/' -> return mempty -- to fix spacing before roman + '0' -> return $ emit "\x2007" -- digit-width space + ':' -> return mempty -- zero-width break + 'A' -> quoteArg >>= checkDefined + 'B' -> escIgnore 'B' [quoteArg] + 'C' -> quoteArg >>= resolveGlyph '\'' + 'D' -> escIgnore 'D' [quoteArg] + 'E' -> escE + 'F' -> escIgnore 'F' groffSkip + 'H' -> escIgnore 'H' [quoteArg] + 'L' -> escIgnore 'L' [quoteArg] + 'M' -> escIgnore 'M' groffSkip + 'N' -> escIgnore 'N' [quoteArg] + 'O' -> escIgnore 'O' [countChar 1 (oneOf ['0','1'])] + 'R' -> escIgnore 'R' [quoteArg] + 'S' -> escIgnore 'S' [quoteArg] + 'V' -> escIgnore 'V' groffSkip + 'X' -> escIgnore 'X' [quoteArg] + 'Y' -> escIgnore 'Y' groffSkip + 'Z' -> escIgnore 'Z' [quoteArg] + '\'' -> return $ emit "'" + '\n' -> return mempty -- line continuation + '^' -> return $ emit "\x200A" -- 1/12 em space + '_' -> return $ emit "_" + '`' -> return $ emit "`" + 'a' -> return mempty -- "non-interpreted leader character" + 'b' -> escIgnore 'b' [quoteArg] + 'c' -> return mempty -- interrupt text processing + 'd' -> escIgnore 'd' [] -- forward down 1/2em + 'e' -> return $ emit "\\" + 'f' -> escFont + 'g' -> escIgnore 'g' groffSkip + 'h' -> escIgnore 'h' [quoteArg] + 'k' -> escIgnore 'k' groffSkip + 'l' -> escIgnore 'l' [quoteArg] + 'm' -> escIgnore 'm' groffSkip + 'n' -> escIgnore 'm' groffSkip + 'o' -> escIgnore 'o' [quoteArg] + 'p' -> escIgnore 'p' [] + 'r' -> escIgnore 'r' [] + 's' -> escIgnore 's' [escapeArg, signedNumber] + 't' -> return $ emit "\t" + 'u' -> escIgnore 'u' [] + 'v' -> escIgnore 'v' [quoteArg] + 'w' -> escIgnore 'w' [quoteArg] + 'x' -> escIgnore 'x' [quoteArg] + 'z' -> escIgnore 'z' [countChar 1 anyChar] + '|' -> return $ emit "\x2006" --1/6 em space + '~' -> return $ emit "\160" -- nonbreaking space + '\\' -> return $ emit "\\" + _ -> return $ emit $ T.singleton c + -- man 7 groff: "If a backslash is followed by a character that + -- does not constitute a defined escape sequence, the backslash + -- is silently ignored and the character maps to itself." + +escIgnore :: (PandocMonad m, RoffMonad x) + => Char + -> [Lexer m x T.Text] + -> Lexer m x (Token x) +escIgnore c argparsers = do + pos <- getPosition + arg <- snd <$> withRaw (choice argparsers) <|> return "" + report $ SkippedContent ("\\" <> T.cons c arg) pos + return mempty + +escUnknown :: (PandocMonad m, RoffMonad x) => T.Text -> Lexer m x (Token x) +escUnknown s = do + pos <- getPosition + report $ SkippedContent s pos + return $ emit "\xFFFD" + +signedNumber :: (PandocMonad m, RoffMonad x) => Lexer m x T.Text +signedNumber = try $ do + sign <- option "" ("-" <$ char '-' <|> "" <$ char '+') + ds <- many1Char digit + return (sign <> ds) + +-- Parses: [..] or (.. +escapeArg :: (PandocMonad m, RoffMonad x) => Lexer m x T.Text +escapeArg = choice + [ char '[' *> optional expandString *> + manyTillChar (noneOf ['\n',']']) (char ']') + , char '(' *> optional expandString *> + countChar 2 (satisfy (/='\n')) + ] + +-- Parses: '..' +quoteArg :: (PandocMonad m, RoffMonad x) => Lexer m x T.Text +quoteArg = char '\'' *> manyTillChar (noneOf ['\n','\'']) (char '\'') From a714f9ddb8d8fe8cbfdbb969e3da52738628de23 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 12 Sep 2024 16:14:54 -0700 Subject: [PATCH 18/81] Comment upon my approach --- src/Text/Pandoc/Readers/Mdoc.hs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 218130c9934d..a7da4c2e0100 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -30,6 +30,11 @@ import qualified Text.Pandoc.Parsing as P import qualified Data.Foldable as Foldable import Text.Pandoc.Shared (stringify) + {- As a general principle, if mandoc -T lint issues a WARNING admonition + or worse about a construct, I consider it fair game for this reader to + do something different than what mandoc does with it, including bailing + out instead of recovering. -} + data MdocSection = ShName | ShSynopsis From 44316511ec1fcdc296cc7128543a86e5bb96d547 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 12 Sep 2024 16:17:02 -0700 Subject: [PATCH 19/81] Add Xr parser --- src/Text/Pandoc/Readers/Mdoc.hs | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index a7da4c2e0100..6b0954bf5643 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -232,6 +232,14 @@ litsToText = do let strs = map (B.str . toString) ls return $ mconcat $ intersperse B.space strs +delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines) +delimitedArgs p = do + openDelim <- mconcat <$> many (parseDelim Open) + inlines <- p + closeDelim <- mconcat <$> many (parseDelim Close) + return (openDelim, inlines, closeDelim) + +-- TODO extract further? simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines simpleInline nm xform = do macro nm @@ -239,9 +247,7 @@ simpleInline nm xform = do return $ mconcat $ intersperse B.space segs where segment = do - openDelim <- mconcat <$> many (parseDelim Open) - inlines <- option mempty litsToText - closeDelim <- mconcat <$> many (parseDelim Close) + (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText let xform' x = if null x then mempty else xform x return $ openDelim <> xform' inlines <> closeDelim @@ -255,8 +261,22 @@ parseSy = simpleInline "Sy" B.strong parseEm :: PandocMonad m => MdocParser m Inlines parseEm = simpleInline "Em" B.emph +-- Xr +parseXr :: PandocMonad m => MdocParser m Inlines +parseXr = do + macro "Xr" + (open, (name, section), close) <- delimitedArgs f + let ref = name <> "(" <> section <> ")" + return $ open <> B.spanWith attr (B.str ref) <> close + where + f = do + n <- lit "Xr manual name" + s <- lit "Xr manual section" + return (toString n, toString s) + attr = (mempty, ["Xr"], mempty) + parseInlineMacro :: PandocMonad m => MdocParser m Inlines -parseInlineMacro = choice [ parseSy, parseEm ] +parseInlineMacro = choice [ parseSy, parseEm, parseXr ] -- TODO this doesn't handle inline macros being interrupted -- by other ones yet, but the lexer doesn't handle it yet From 759d327de20e71b0dbe9dfb8e26f17892e5e9977 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 12 Sep 2024 17:32:40 -0700 Subject: [PATCH 20/81] Lex callable macros in parsed control lines See mdoc(7) section MACRO SYNTAX --- pandoc.cabal | 2 + src/Text/Pandoc/Readers/Mdoc/Lex.hs | 15 +- src/Text/Pandoc/Readers/Mdoc/Macros.hs | 185 +++++++++++++++++++++++++ 3 files changed, 200 insertions(+), 2 deletions(-) create mode 100644 src/Text/Pandoc/Readers/Mdoc/Macros.hs diff --git a/pandoc.cabal b/pandoc.cabal index b885c3db52f1..39508116c1fe 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -522,6 +522,7 @@ library split >= 0.2 && < 0.3, syb >= 0.1 && < 0.8, tagsoup >= 0.14.6 && < 0.15, + template-haskell, temporary >= 1.1 && < 1.4, texmath >= 0.12.8.10 && < 0.13, text >= 1.1.1.0 && < 2.2, @@ -702,6 +703,7 @@ library Text.Pandoc.Readers.LaTeX.Parsing, Text.Pandoc.Readers.LaTeX.SIunitx, Text.Pandoc.Readers.LaTeX.Table, + Text.Pandoc.Readers.Mdoc.Macros, Text.Pandoc.Readers.Typst.Parsing, Text.Pandoc.Readers.Typst.Math, Text.Pandoc.Readers.ODT.Base, diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index 03e3f999aa35..db58ac27d204 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -33,6 +33,7 @@ import qualified Data.Text as T import Text.Pandoc.Options import Text.Pandoc.Parsing import Text.Pandoc.Readers.Roff.Escape +import Text.Pandoc.Readers.Mdoc.Macros import qualified Data.Sequence as Seq -- import Debug.Trace (traceShowId) @@ -154,6 +155,12 @@ lexMacro = do isMacroChar '%' = True isMacroChar x = isAlphaNum x +lexCallableMacro :: PandocMonad m => Lexer m MdocToken +lexCallableMacro = do + m@(Macro name _) <- lexMacro + guard $ isCallableMacro name + return m + lexDelim :: PandocMonad m => Lexer m MdocToken lexDelim = do pos <- getPosition @@ -185,9 +192,13 @@ lexControlLine = do pos <- getPosition guard $ sourceColumn pos == 1 char '.' - m <- lexMacro - (wds, e) <- manyUntil (try lexDelim <|> lexLit) eofline + m@(Macro name _) <- lexMacro + let parsed = isParsedMacro name + (wds, e) <- manyUntil (l parsed) eofline return $ MdocTokens $ Seq.fromList $ (m:wds) <> [e] + where + l True = try lexDelim <|> try lexCallableMacro <|> lexLit + l False = try lexDelim <|> lexLit -- | Tokenize a string as a sequence of roff tokens. lexMdoc :: PandocMonad m => SourcePos -> T.Text -> m MdocTokens diff --git a/src/Text/Pandoc/Readers/Mdoc/Macros.hs b/src/Text/Pandoc/Readers/Mdoc/Macros.hs new file mode 100644 index 000000000000..174a0de39bbd --- /dev/null +++ b/src/Text/Pandoc/Readers/Mdoc/Macros.hs @@ -0,0 +1,185 @@ +{-# LANGUAGE TemplateHaskell #-} +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.Mdoc.Macros + Copyright : + License : GNU GPL, version 2 or above + + Maintainer : a + Stability : WIP + Portability : portable + +-} +module Text.Pandoc.Readers.Mdoc.Macros (isParsedMacro, isCallableMacro) where + +import Data.Set (member, fromList, Set) +import Data.Text +import Language.Haskell.TH.Syntax (lift) + +isParsedMacro :: Text -> Bool +isParsedMacro a = member a parsedMacros + +isCallableMacro :: Text -> Bool +isCallableMacro a = member a callableMacros + +parsedMacros :: Set Text +parsedMacros = $(lift (fromList [ + "Ac", + "Ad", + "An", + "Ao", + "Ap", + "Aq", + "Ar", + "At", + "Bc", + "Bo", + "Bq", + "Brc", + "Bro", + "Brq", + "Bsx", + "Bx", + "Cd", + "Cm", + "D1", + "Dc", + "Dl", + "Do", + "Dq", + "Dv", + "Dx", + "Ec", + "Em", + "En", + "Eo", + "Er", + "Es", + "Ev", + "Fa", + "Fc", + "Fl", + "Fn", + "Fr", + "Ft", + "Fx", + "Ic", + "In", + "It", + "Li", + "Lk", + "Ms", + "Mt", + "Nm", + "No", + "Ns", + "Nx", + "Oc", + "Oo", + "Op", + "Ot", + "Ox", + "Pa", + "Pc", + "Pf", + "Po", + "Pq", + "Qc", + "Ql", + "Qo", + "Qq", + "Sc", + "Sh", + "So", + "Sq", + "Ss", + "St", + "Sx", + "Sy", + "Ta", + "Tn", + "Ux", + "Va", + "Vt", + "Xc", + "Xo", + "Xr"] :: Set Text)) + +callableMacros :: Set Text +callableMacros = $(lift (fromList [ + "Ac", + "Ad", + "An", + "Ao", + "Ap", + "Aq", + "Ar", + "At", + "Bc", + "Bo", + "Bq", + "Brc", + "Bro", + "Brq", + "Bsx", + "Bx", + "Cd", + "Cm", + "Dc", + "Do", + "Dq", + "Dv", + "Dx", + "Ec", + "Em", + "En", + "Eo", + "Er", + "Es", + "Ev", + "Fa", + "Fc", + "Fl", + "Fn", + "Fo", + "Fr", + "Ft", + "Fx", + "Ic", + "In", + "Li", + "Lk", + "Ms", + "Mt", + "Nm", + "No", + "Ns", + "Nx", + "Oc", + "Oo", + "Op", + "Ot", + "Ox", + "Pa", + "Pc", + "Pf", + "Po", + "Pq", + "Qc", + "Ql", + "Qo", + "Qq", + "Sc", + "So", + "Sq", + "St", + "Sx", + "Sy", + "Ta", + "Tn", + "Ux", + "Va", + "Vt", + "Xc", + "Xo", + "Xr"] :: Set Text)) From 784a873628912abc489eea604b05beaa36827403 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 12 Sep 2024 19:07:19 -0700 Subject: [PATCH 21/81] Correct comments --- src/Text/Pandoc/Readers/Mdoc.hs | 2 +- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 6b0954bf5643..46def38d15e3 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -9,7 +9,7 @@ Stability : WIP Portability : portable -Conversion of man to 'Pandoc' document. +Conversion of mdoc to 'Pandoc' document. -} module Text.Pandoc.Readers.Mdoc (readMdoc) where diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index db58ac27d204..6cb0a1bd2608 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -13,7 +13,7 @@ Stability : WIP Portability : portable -Tokenizer for roff formats (man, ms). +Tokenizer for mdoc format -} module Text.Pandoc.Readers.Mdoc.Lex ( MdocToken(..) From 1e8f7b700fed28aba71acd9717e594f28619292d Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 12 Sep 2024 19:21:55 -0700 Subject: [PATCH 22/81] Parse Nm macro (outside Sh NAME) --- src/Text/Pandoc/Readers/Mdoc.hs | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 46def38d15e3..ec7498eaf0fd 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -248,18 +248,36 @@ simpleInline nm xform = do where segment = do (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText - let xform' x = if null x then mempty else xform x - return $ openDelim <> xform' inlines <> closeDelim + return $ openDelim <> xform inlines <> closeDelim +eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines +eliminateEmpty x y = if null y then mempty else x y + +cls :: T.Text -> B.Attr +cls x = (mempty, [x], mempty) -- Sy: callable, parsed, >0 arguments -- mandoc -T html formats Sy with a tag, since it's not really -- semantically , but Strong is our best option in Pandoc parseSy :: PandocMonad m => MdocParser m Inlines -parseSy = simpleInline "Sy" B.strong +parseSy = simpleInline "Sy" (eliminateEmpty B.strong) parseEm :: PandocMonad m => MdocParser m Inlines -parseEm = simpleInline "Em" B.emph +parseEm = simpleInline "Em" (eliminateEmpty B.emph) + +parseNm :: PandocMonad m => MdocParser m Inlines +parseNm = do + mnm <- (progName <$> getState) + case mnm of + Nothing -> do + (_, nm, _) <- lookAhead $ delimitedArgs $ option mempty litsToText + guard $ not (null nm) + simpleInline "Nm" ok + Just nm -> simpleInline "Nm" $ \x -> + if null x + then B.codeWith (cls "Nm") nm + else ok x + where ok = B.codeWith (cls "Nm") . stringify -- Xr parseXr :: PandocMonad m => MdocParser m Inlines @@ -267,16 +285,15 @@ parseXr = do macro "Xr" (open, (name, section), close) <- delimitedArgs f let ref = name <> "(" <> section <> ")" - return $ open <> B.spanWith attr (B.str ref) <> close + return $ open <> B.spanWith (cls "Xr") (B.str ref) <> close where f = do n <- lit "Xr manual name" s <- lit "Xr manual section" return (toString n, toString s) - attr = (mempty, ["Xr"], mempty) parseInlineMacro :: PandocMonad m => MdocParser m Inlines -parseInlineMacro = choice [ parseSy, parseEm, parseXr ] +parseInlineMacro = choice [ parseSy, parseEm, parseNm, parseXr ] -- TODO this doesn't handle inline macros being interrupted -- by other ones yet, but the lexer doesn't handle it yet From 57c97095b23dd84e72f1d3468fdc0b7437812dfb Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 12 Sep 2024 19:23:09 -0700 Subject: [PATCH 23/81] Parse multiple inline macros per control line --- src/Text/Pandoc/Readers/Mdoc.hs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index ec7498eaf0fd..a73a1fcc6783 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -295,11 +295,8 @@ parseXr = do parseInlineMacro :: PandocMonad m => MdocParser m Inlines parseInlineMacro = choice [ parseSy, parseEm, parseNm, parseXr ] --- TODO this doesn't handle inline macros being interrupted --- by other ones yet, but the lexer doesn't handle it yet --- either parseInline :: PandocMonad m => MdocParser m Inlines -parseInline = parseStr <|> (parseInlineMacro <* eol) +parseInline = parseStr <|> mconcat <$> (many1Till (parseInlineMacro <|> litsToText) eol) parseInlines :: PandocMonad m => MdocParser m Inlines parseInlines = mconcat . intersperse B.space <$> many1 parseInline From 34a83d91770f4253d1f7844ac81a978a3758082f Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 12 Sep 2024 19:23:27 -0700 Subject: [PATCH 24/81] Add a deprecated macro synonym --- src/Text/Pandoc/Readers/Mdoc.hs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index a73a1fcc6783..c41a5a31bf3a 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -298,12 +298,13 @@ parseInlineMacro = choice [ parseSy, parseEm, parseNm, parseXr ] parseInline :: PandocMonad m => MdocParser m Inlines parseInline = parseStr <|> mconcat <$> (many1Till (parseInlineMacro <|> litsToText) eol) +-- TODO probably need some kind of fold to deal with Ns parseInlines :: PandocMonad m => MdocParser m Inlines parseInlines = mconcat . intersperse B.space <$> many1 parseInline parsePara :: PandocMonad m => MdocParser m Blocks parsePara = do - optional (emptyMacro "Pp") + optional (emptyMacro "Pp" <|> emptyMacro "Lp") -- Lp: deprecated synonym for Pp B.para . B.trimInlines <$> parseInlines -- CodeBlocks can't contain any other markup, but mdoc From a3a892ab552b0748244bb60503bb2e9e0e714fa5 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sat, 14 Sep 2024 11:26:00 -0700 Subject: [PATCH 25/81] Plan for special handling of AUTHORS --- src/Text/Pandoc/Readers/Mdoc.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index c41a5a31bf3a..679888473d2e 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -38,6 +38,7 @@ import Text.Pandoc.Shared (stringify) data MdocSection = ShName | ShSynopsis + | ShAuthors | ShOther deriving (Show, Eq) @@ -179,6 +180,7 @@ parsePrologue = do shToSectionMode :: T.Text -> MdocSection shToSectionMode "NAME" = ShName shToSectionMode "SYNOPSIS" = ShSynopsis +shToSectionMode "AUTHORS" = ShAuthors shToSectionMode _ = ShOther parseHeader :: PandocMonad m => MdocParser m Blocks From 4b3ecb86d9eb16cf9b169b7c326d5e005cf3802d Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sat, 14 Sep 2024 11:47:35 -0700 Subject: [PATCH 26/81] Correct pasted comment --- src/Text/Pandoc/Readers/Mdoc.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 679888473d2e..fbcfcafdf665 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -59,7 +59,7 @@ instance Default ManState where type MdocParser m = P.ParsecT [MdocToken] ManState m --- | Read man (troff) from an input string and return a Pandoc document. +-- | Read mdoc from an input string and return a Pandoc document. readMdoc :: (PandocMonad m, ToSources a) => ReaderOptions -> a From 91f24a042a1c299e3e677a63a1c764932f93ff23 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sat, 14 Sep 2024 12:41:13 -0700 Subject: [PATCH 27/81] Add macros for line-scoped enclosures --- src/Text/Pandoc/Readers/Mdoc.hs | 52 ++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index fbcfcafdf665..ab14590fd883 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -252,6 +252,12 @@ simpleInline nm xform = do (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText return $ openDelim <> xform inlines <> closeDelim +lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines +lineEnclosure nm xform = do + macro nm + inner <- many1 (parseInlineMacro <|> litsToText) + return $ (xform . mconcat . intersperse B.space) inner + eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines eliminateEmpty x y = if null y then mempty else x y @@ -267,6 +273,27 @@ parseSy = simpleInline "Sy" (eliminateEmpty B.strong) parseEm :: PandocMonad m => MdocParser m Inlines parseEm = simpleInline "Em" (eliminateEmpty B.emph) +parseQl :: PandocMonad m => MdocParser m Inlines +parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify + +parseDq :: PandocMonad m => MdocParser m Inlines +parseDq = lineEnclosure "Dq" B.doubleQuoted + +parseSq :: PandocMonad m => MdocParser m Inlines +parseSq = lineEnclosure "Sq" B.singleQuoted + +parsePq :: PandocMonad m => MdocParser m Inlines +parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")" + +parseBq :: PandocMonad m => MdocParser m Inlines +parseBq = lineEnclosure "Bq" $ \x -> "[" <> x <> "]" + +parseBrq :: PandocMonad m => MdocParser m Inlines +parseBrq = lineEnclosure "Brq" $ \x -> "{" <> x <> "}" + +parseAq :: PandocMonad m => MdocParser m Inlines +parseAq = lineEnclosure "Aq" $ \x -> "⟨" <> x <> "⟩" + parseNm :: PandocMonad m => MdocParser m Inlines parseNm = do mnm <- (progName <$> getState) @@ -294,15 +321,32 @@ parseXr = do s <- lit "Xr manual section" return (toString n, toString s) +-- TODO should possibly rename this function b/c some of these are +-- Mdoc block partial-implicit macros. Unclear if this distinction +-- is going to be relevant. parseInlineMacro :: PandocMonad m => MdocParser m Inlines -parseInlineMacro = choice [ parseSy, parseEm, parseNm, parseXr ] +parseInlineMacro = + choice + [ parseSy, + parseEm, + parseNm, + parseXr, + parseQl, + parseSq, + parseDq, + parsePq, + parseBq, + parseBrq, + parseAq + ] + +parseInline :: PandocMonad m => MdocParser m [Inlines] +parseInline = (parseStr >>= return . (:[])) <|> (many1Till (parseInlineMacro <|> litsToText) eol) -parseInline :: PandocMonad m => MdocParser m Inlines -parseInline = parseStr <|> mconcat <$> (many1Till (parseInlineMacro <|> litsToText) eol) -- TODO probably need some kind of fold to deal with Ns parseInlines :: PandocMonad m => MdocParser m Inlines -parseInlines = mconcat . intersperse B.space <$> many1 parseInline +parseInlines = mconcat . intersperse B.space . mconcat <$> many1 parseInline parsePara :: PandocMonad m => MdocParser m Blocks parsePara = do From 497477d99bfa3b43ebbe9ecbf8ca22fa72846268 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 11:10:47 -0700 Subject: [PATCH 28/81] Add Qq --- src/Text/Pandoc/Readers/Mdoc.hs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index ab14590fd883..8571455afa37 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -282,6 +282,9 @@ parseDq = lineEnclosure "Dq" B.doubleQuoted parseSq :: PandocMonad m => MdocParser m Inlines parseSq = lineEnclosure "Sq" B.singleQuoted +parseQq :: PandocMonad m => MdocParser m Inlines +parseQq = lineEnclosure "Qq" $ \x -> "\"" <> x <> "\"" + parsePq :: PandocMonad m => MdocParser m Inlines parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")" From 713192393723b4ac12ebd19c7cc9b208f77c3a99 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 14:16:51 -0700 Subject: [PATCH 29/81] Extract spacify function This will handle Ns in the future --- src/Text/Pandoc/Readers/Mdoc.hs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 8571455afa37..bd7223cd2993 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -163,7 +163,7 @@ argsToInlines :: PandocMonad m => MdocParser m Inlines argsToInlines = do ls <- manyTill arg eol let strs = map (B.str . toString) ls - return $ mconcat $ intersperse B.space strs + return $ spacify strs parsePrologue :: PandocMonad m => MdocParser m () parsePrologue = do @@ -232,7 +232,7 @@ litsToText :: PandocMonad m => MdocParser m Inlines litsToText = do ls <- many1 lit let strs = map (B.str . toString) ls - return $ mconcat $ intersperse B.space strs + return $ spacify strs delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines) delimitedArgs p = do @@ -246,7 +246,7 @@ simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m simpleInline nm xform = do macro nm segs <- manyTill segment inlineContextEnd - return $ mconcat $ intersperse B.space segs + return $ spacify segs where segment = do (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText @@ -258,6 +258,9 @@ lineEnclosure nm xform = do inner <- many1 (parseInlineMacro <|> litsToText) return $ (xform . mconcat . intersperse B.space) inner +spacify :: [Inlines] -> Inlines +spacify = mconcat . intersperse B.space + eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines eliminateEmpty x y = if null y then mempty else x y From 1f5da9a9358b1552f3ab27263ae53be6e367c5a7 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 14:18:29 -0700 Subject: [PATCH 30/81] Rework enclosures and support multiline ones --- src/Text/Pandoc/Readers/Mdoc.hs | 42 ++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index bd7223cd2993..98d817e09f3a 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -234,6 +234,16 @@ litsToText = do let strs = map (B.str . toString) ls return $ spacify strs +litsAndDelimsToText :: PandocMonad m => MdocParser m Inlines +litsAndDelimsToText = do + ods <- mconcat <$> many (parseDelim Open) + ls <- many lit + cds <- mconcat <$> if null ods && null ls + then many1 (parseDelim Close) + else many (parseDelim Close) + let strs = map (B.str . toString) ls + return $ ods <> spacify strs <> cds + delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines) delimitedArgs p = do openDelim <- mconcat <$> many (parseDelim Open) @@ -255,12 +265,27 @@ simpleInline nm xform = do lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines lineEnclosure nm xform = do macro nm - inner <- many1 (parseInlineMacro <|> litsToText) - return $ (xform . mconcat . intersperse B.space) inner + --- XXX wtf + (first, further, finally) <- delimitedArgs + (manyTill + (parseInlineMacro <|> (try (litsAndDelimsToText <* notFollowedBy eol)) + <|> litsToText) (lookAhead (many (delim Close) *> eol))) + return $ first <> xform (spacify further) <> finally spacify :: [Inlines] -> Inlines spacify = mconcat . intersperse B.space +{- Compatibility note: mandoc permits, and doesn't warn on, "vertical" macros + (Pp, Bl/El, Bd/Ed) inside of "horizontal" block partial-explicit quotations +like Do/Dc. However there are no OpenBSD manual pages that employ such markup +and it doesn't look right when rendered. We don't attempt to consume anything +but pandoc inlines inside of these multiline enclosures. -} +multilineEnclosure :: PandocMonad m => T.Text -> T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines +multilineEnclosure op cl xform = do + macro op + (first, further, finally) <- delimitedArgs (manyTill parseInlines (macro cl)) + return $ first <> xform (spacify further) <> finally + eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines eliminateEmpty x y = if null y then mempty else x y @@ -282,6 +307,9 @@ parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify parseDq :: PandocMonad m => MdocParser m Inlines parseDq = lineEnclosure "Dq" B.doubleQuoted +parseDo :: PandocMonad m => MdocParser m Inlines +parseDo = ptrace $ multilineEnclosure "Do" "Dc" B.doubleQuoted + parseSq :: PandocMonad m => MdocParser m Inlines parseSq = lineEnclosure "Sq" B.singleQuoted @@ -343,16 +371,18 @@ parseInlineMacro = parsePq, parseBq, parseBrq, - parseAq + parseAq, + parseDo ] -parseInline :: PandocMonad m => MdocParser m [Inlines] -parseInline = (parseStr >>= return . (:[])) <|> (many1Till (parseInlineMacro <|> litsToText) eol) +parseInline :: PandocMonad m => MdocParser m Inlines +parseInline = parseStr <|> + ((parseInlineMacro <|> litsAndDelimsToText) <* optional eol) -- TODO probably need some kind of fold to deal with Ns parseInlines :: PandocMonad m => MdocParser m Inlines -parseInlines = mconcat . intersperse B.space . mconcat <$> many1 parseInline +parseInlines = spacify <$> many1 parseInline parsePara :: PandocMonad m => MdocParser m Blocks parsePara = do From 10bdda8c86e54a084b471cef092c07f37b68b790 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 14:19:20 -0700 Subject: [PATCH 31/81] fixup: qq --- src/Text/Pandoc/Readers/Mdoc.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 98d817e09f3a..0a23ea2a7b35 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -368,6 +368,7 @@ parseInlineMacro = parseQl, parseSq, parseDq, + parseQq, parsePq, parseBq, parseBrq, From 41f1454f638933abc04c4d86722c2903fdc6c12c Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 14:36:37 -0700 Subject: [PATCH 32/81] Add Op macro Which we just treat as another kind of Bq, for now --- src/Text/Pandoc/Readers/Mdoc.hs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 0a23ea2a7b35..5fee2d6c044e 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -322,6 +322,12 @@ parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")" parseBq :: PandocMonad m => MdocParser m Inlines parseBq = lineEnclosure "Bq" $ \x -> "[" <> x <> "]" +-- For our purposes this probably behaves identically to Bq +-- in most circumstances but I might need to do something +-- special with it in SYNOPSIS +parseOp :: PandocMonad m => MdocParser m Inlines +parseOp = lineEnclosure "Op" $ \x -> "[" <> x <> "]" + parseBrq :: PandocMonad m => MdocParser m Inlines parseBrq = lineEnclosure "Brq" $ \x -> "{" <> x <> "}" @@ -366,6 +372,7 @@ parseInlineMacro = parseNm, parseXr, parseQl, + parseOp, parseSq, parseDq, parseQq, From 03d82ba87c932599caa442b4ac17e64e3e5fd649 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 14:37:37 -0700 Subject: [PATCH 33/81] Add partial Ns support We parse this into a RawInline, which will usually get removed by the spacify function, once I write the version of this that actually does that. --- src/Text/Pandoc/Readers/Mdoc.hs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 5fee2d6c044e..831d81d53ea5 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -361,6 +361,9 @@ parseXr = do s <- lit "Xr manual section" return (toString n, toString s) +parseNs :: PandocMonad m => MdocParser m Inlines +parseNs = macro "Ns" >> return (B.rawInline "mdoc" "Ns") + -- TODO should possibly rename this function b/c some of these are -- Mdoc block partial-implicit macros. Unclear if this distinction -- is going to be relevant. @@ -380,7 +383,8 @@ parseInlineMacro = parseBq, parseBrq, parseAq, - parseDo + parseDo, + parseNs ] parseInline :: PandocMonad m => MdocParser m Inlines From ffe16cceaa17d0f5af144abff298324922a82d0a Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 18:01:12 -0700 Subject: [PATCH 34/81] Lex called macros better --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index 6cb0a1bd2608..350a4ffb30a2 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -149,6 +149,7 @@ lexMacro :: PandocMonad m => Lexer m MdocToken lexMacro = do pos <- getPosition name <- many1Char (satisfy isMacroChar) + eof <|> void (lookAhead (spaceChar <|> newline)) skipSpaces return $ Macro name pos where From 87097af014e3a9c1e50e6727417a1988f76727d4 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 18:01:36 -0700 Subject: [PATCH 35/81] Lex inner quotes in quoted args correctly --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index 350a4ffb30a2..ac3a753aac06 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -124,7 +124,7 @@ spaceTabChar = T.singleton <$> spaceChar quotedArg :: PandocMonad m => Lexer m T.Text quotedArg = do quoteChar - t <- mconcat <$> many (escape <|> regularText <|> innerQuote <|> spaceTabChar) + t <- mconcat <$> many (try innerQuote <|> escape <|> regularText <|> spaceTabChar) quoteChar notFollowedBy quoteChar return t From fbd18a24b04fe6d7c5e36a6375fc4ef3846230bd Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 21:07:35 -0700 Subject: [PATCH 36/81] Handle the Ns macro --- src/Text/Pandoc/Readers/Mdoc.hs | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 831d81d53ea5..254932790764 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -272,8 +272,28 @@ lineEnclosure nm xform = do <|> litsToText) (lookAhead (many (delim Close) *> eol))) return $ first <> xform (spacify further) <> finally +noSpace :: Inlines +noSpace = B.rawInline "mdoc" "Ns" + +data SpacifyState = SpacifyState { accum :: [Inlines], prev :: Inlines, ns :: Bool } +instance Default SpacifyState where def = SpacifyState [] mempty False + +foldNoSpaces :: [Inlines] -> [Inlines] +foldNoSpaces xs = (finalize . foldl go def) xs + where + go :: SpacifyState -> Inlines -> SpacifyState + go s x + | ns s && x == noSpace = s + | ns s = s{prev = prev s <> x, ns = False} + | x == noSpace = s{ns = True} + | null (prev s) = s{prev = x} + | otherwise = s{accum = accum s <> [prev s], prev = x} + finalize s + | null (prev s) = accum s + | otherwise = accum s <> [prev s] + spacify :: [Inlines] -> Inlines -spacify = mconcat . intersperse B.space +spacify = mconcat . intersperse B.space . foldNoSpaces {- Compatibility note: mandoc permits, and doesn't warn on, "vertical" macros (Pp, Bl/El, Bd/Ed) inside of "horizontal" block partial-explicit quotations @@ -362,7 +382,7 @@ parseXr = do return (toString n, toString s) parseNs :: PandocMonad m => MdocParser m Inlines -parseNs = macro "Ns" >> return (B.rawInline "mdoc" "Ns") +parseNs = macro "Ns" >> return noSpace -- TODO should possibly rename this function b/c some of these are -- Mdoc block partial-implicit macros. Unclear if this distinction From f9888047294f4a2bfa4e0af1a8be822da03a2826 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 21:08:11 -0700 Subject: [PATCH 37/81] fixup: Remove Do trace --- src/Text/Pandoc/Readers/Mdoc.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 254932790764..8977e3d6b3f3 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -328,7 +328,7 @@ parseDq :: PandocMonad m => MdocParser m Inlines parseDq = lineEnclosure "Dq" B.doubleQuoted parseDo :: PandocMonad m => MdocParser m Inlines -parseDo = ptrace $ multilineEnclosure "Do" "Dc" B.doubleQuoted +parseDo = multilineEnclosure "Do" "Dc" B.doubleQuoted parseSq :: PandocMonad m => MdocParser m Inlines parseSq = lineEnclosure "Sq" B.singleQuoted From 83a523d23b15b330057aaefefa08105ba58411da Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 21:09:04 -0700 Subject: [PATCH 38/81] Lex quoted delimiters on control lines --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index ac3a753aac06..fe50f64f02ac 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -24,11 +24,12 @@ module Text.Pandoc.Readers.Mdoc.Lex ) where -import Control.Monad (void, guard) +import Control.Monad (void, guard, when) import Control.Monad.Except (throwError) import Text.Pandoc.Class.PandocMonad (PandocMonad(..)) import Data.Char (isAlphaNum) import Data.Default (Default) +import Data.Maybe (isJust) import qualified Data.Text as T import Text.Pandoc.Options import Text.Pandoc.Parsing @@ -165,7 +166,9 @@ lexCallableMacro = do lexDelim :: PandocMonad m => Lexer m MdocToken lexDelim = do pos <- getPosition + q <- optionMaybe quoteChar t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"] + when (isJust q) (void quoteChar) eof <|> void (lookAhead (spaceChar <|> newline)) skipSpaces return $ t pos From 6ed3d8387f25d5267aed4b0086159d57f7f799e1 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 21:09:39 -0700 Subject: [PATCH 39/81] Start Mdoc tests --- pandoc.cabal | 1 + test/Tests/Readers/Mdoc.hs | 71 ++++++++++++++++++++++++++++++++++++++ test/test-pandoc.hs | 2 ++ 3 files changed, 74 insertions(+) create mode 100644 test/Tests/Readers/Mdoc.hs diff --git a/pandoc.cabal b/pandoc.cabal index 39508116c1fe..e36eb4a9ed1f 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -827,6 +827,7 @@ test-suite test-pandoc Tests.Readers.Muse Tests.Readers.Creole Tests.Readers.Man + Tests.Readers.Mdoc Tests.Readers.FB2 Tests.Readers.DokuWiki Tests.Writers.Native diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs new file mode 100644 index 000000000000..24a535f4ae5f --- /dev/null +++ b/test/Tests/Readers/Mdoc.hs @@ -0,0 +1,71 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Tests.Readers.Mdoc + Copyright : © 2024 Evan Silberman + License : GNU GPL, version 2 or above + + Maintainer : + Stability : alpha + Portability : portable + +Tests for the Mdoc reader. +-} + +module Tests.Readers.Mdoc (tests) where + +import Data.Text (Text) +import qualified Data.Text as T +import Test.Tasty +import Test.Tasty.HUnit (HasCallStack) +import Tests.Helpers +import Text.Pandoc +import Text.Pandoc.Arbitrary () +import Text.Pandoc.Builder + +mdoc :: Text -> Pandoc +mdoc = purely $ readMdoc def + +infix 4 =: +(=:) :: (ToString c, HasCallStack) + => String -> (Text, c) -> TestTree +(=:) = test mdoc + +tests :: [TestTree] +tests = [ + testGroup "one-line enclosures" + [ "Dq" =: + ".Dq hello world" =?> + para (doubleQuoted "hello world") + , "Sq" =: + ".Sq hello world" =?> + para (singleQuoted "hello world") + ] + , testGroup "inlines" + [ "Sy" =: + ".Sy hello world" =?> + para (strong "hello world") + , "delimiters" =: + ".Sy ( hello world )" =?> + para (mconcat ["(", strong "hello world", ")"]) + , "multiple" =: + ".Sy hello Em world" =?> + para (strong "hello" <> space <> emph "world") + ] + , testGroup "Ns macro" + [ "at the beginning of a macro line (mandoc delta)" =: + T.unlines [".Op before", ".Ns Op after"] =?> + para "[before][after]" + , "after a block closing macro" =: + T.unlines [".Oo before", ".Oc Ns Op after"] =?> + para "[before][after]" + , "in the middle of a macro line" =: + ".Oo before Oc Ns Op after" =?> + para "[before][after]" + , "at the end of a macro line" =: + T.unlines [".Oo before Oc Ns", ".Op after"] =?> + para "[before][after]" + , "at the end of a partial-implicit line" =: + T.unlines [".Op before Ns", ".Op after"] =?> + para "[before][after]" + ] + ] diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs index d8c78276b3ec..6d749b84549f 100644 --- a/test/test-pandoc.hs +++ b/test/test-pandoc.hs @@ -29,6 +29,7 @@ import qualified Tests.Readers.RST import qualified Tests.Readers.RTF import qualified Tests.Readers.Txt2Tags import qualified Tests.Readers.Man +import qualified Tests.Readers.Mdoc import qualified Tests.Shared import qualified Tests.Writers.AsciiDoc import qualified Tests.Writers.ConTeXt @@ -97,6 +98,7 @@ tests pandocPath = testGroup "pandoc tests" , testGroup "Muse" Tests.Readers.Muse.tests , testGroup "Creole" Tests.Readers.Creole.tests , testGroup "Man" Tests.Readers.Man.tests + , testGroup "Mdoc" Tests.Readers.Mdoc.tests , testGroup "FB2" Tests.Readers.FB2.tests , testGroup "DokuWiki" Tests.Readers.DokuWiki.tests ] From 12f4c49ae77032949b46f28cccc09fef79080804 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 21:10:20 -0700 Subject: [PATCH 40/81] Use lineEnclosure to parse Sh/Ss --- src/Text/Pandoc/Readers/Mdoc.hs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 8977e3d6b3f3..e1cf9970aead 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -185,8 +185,9 @@ shToSectionMode _ = ShOther parseHeader :: PandocMonad m => MdocParser m Blocks parseHeader = do - (Macro m _) <- macro "Sh" <|> macro "Ss" - txt <- argsToInlines + (Macro m _) <- lookAhead $ macro "Sh" <|> macro "Ss" + txt <- lineEnclosure m id + eol let lvl = if m == "Sh" then 1 else 2 when (lvl == 1) $ modifyState $ \s -> s{currentSection = (shToSectionMode . stringify) txt} return $ B.header lvl txt From 3691891c46a4322f13c76889158a2699a0abdd84 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 21:10:31 -0700 Subject: [PATCH 41/81] Add Oo/Op macro --- src/Text/Pandoc/Readers/Mdoc.hs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index e1cf9970aead..ea6639ed9627 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -349,6 +349,9 @@ parseBq = lineEnclosure "Bq" $ \x -> "[" <> x <> "]" parseOp :: PandocMonad m => MdocParser m Inlines parseOp = lineEnclosure "Op" $ \x -> "[" <> x <> "]" +parseOo :: PandocMonad m => MdocParser m Inlines +parseOo = multilineEnclosure "Oo" "Oc" $ \x -> "[" <> x <> "]" + parseBrq :: PandocMonad m => MdocParser m Inlines parseBrq = lineEnclosure "Brq" $ \x -> "{" <> x <> "}" @@ -405,6 +408,7 @@ parseInlineMacro = parseBrq, parseAq, parseDo, + parseOo, parseNs ] From f8d263fe910a63b20070aa60d6c86bbacd1f8f34 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 21:14:08 -0700 Subject: [PATCH 42/81] Handle Ns at end of control line correctly --- src/Text/Pandoc/Readers/Mdoc.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index ea6639ed9627..25d0618056b2 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -270,7 +270,7 @@ lineEnclosure nm xform = do (first, further, finally) <- delimitedArgs (manyTill (parseInlineMacro <|> (try (litsAndDelimsToText <* notFollowedBy eol)) - <|> litsToText) (lookAhead (many (delim Close) *> eol))) + <|> litsToText) (lookAhead (many (macro "Ns" <|> delim Close) *> eol))) return $ first <> xform (spacify further) <> finally noSpace :: Inlines From e6d6dd97cc72e00777f81c8e4401b576d72fdcf3 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 21:39:49 -0700 Subject: [PATCH 43/81] Add the No macro --- src/Text/Pandoc/Readers/Mdoc.hs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 25d0618056b2..6803fe6f2b91 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -322,6 +322,9 @@ parseSy = simpleInline "Sy" (eliminateEmpty B.strong) parseEm :: PandocMonad m => MdocParser m Inlines parseEm = simpleInline "Em" (eliminateEmpty B.emph) +parseNo :: PandocMonad m => MdocParser m Inlines +parseNo = simpleInline "No" (eliminateEmpty id) + parseQl :: PandocMonad m => MdocParser m Inlines parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify @@ -396,6 +399,7 @@ parseInlineMacro = choice [ parseSy, parseEm, + parseNo, parseNm, parseXr, parseQl, From d319de0c5b1ce9a17fa6ada0f365e0793b01b9e8 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 21:40:21 -0700 Subject: [PATCH 44/81] Lex quoted callable macros as macros --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index fe50f64f02ac..b49a6e7845ca 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -146,22 +146,30 @@ quoteChar = T.singleton <$> char '"' mdocToken :: PandocMonad m => Lexer m MdocTokens mdocToken = lexComment <|> lexControlLine <|> lexTextLine +lexMacroName :: PandocMonad m => Lexer m T.Text +lexMacroName = many1Char (satisfy isMacroChar) + where + isMacroChar '%' = True + isMacroChar x = isAlphaNum x + lexMacro :: PandocMonad m => Lexer m MdocToken lexMacro = do pos <- getPosition - name <- many1Char (satisfy isMacroChar) + name <- lexMacroName eof <|> void (lookAhead (spaceChar <|> newline)) skipSpaces return $ Macro name pos - where - isMacroChar '%' = True - isMacroChar x = isAlphaNum x lexCallableMacro :: PandocMonad m => Lexer m MdocToken lexCallableMacro = do - m@(Macro name _) <- lexMacro + pos <- getPosition + q <- optionMaybe quoteChar + name <- lexMacroName + when (isJust q) (void quoteChar) + eof <|> void (lookAhead (spaceChar <|> newline)) + skipSpaces guard $ isCallableMacro name - return m + return $ Macro name pos lexDelim :: PandocMonad m => Lexer m MdocToken lexDelim = do From 30e3fce90c50f35597c8e2b6013e8edb28443689 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 21:40:59 -0700 Subject: [PATCH 45/81] Convert some more Ns tests from mandoc --- test/Tests/Readers/Mdoc.hs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index 24a535f4ae5f..b45a85518773 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -61,11 +61,26 @@ tests = [ , "in the middle of a macro line" =: ".Oo before Oc Ns Op after" =?> para "[before][after]" + , "before closing punctuation" =: + ".Oo before Oc Ns : Op after" =?> + para "[before]: [after]" + , "after closing punctuation" =: + ".Oo before Oc : Ns Op after" =?> + para "[before]:[after]" , "at the end of a macro line" =: T.unlines [".Oo before Oc Ns", ".Op after"] =?> para "[before][after]" , "at the end of a partial-implicit line" =: T.unlines [".Op before Ns", ".Op after"] =?> para "[before][after]" + , "normal words" =: + ".No no Ns ns No no" =?> + para ("nons" <> space <> "no") + , "opening punctuation" =: + ".No no Ns \"(\" ns No no" =?> + para ("no(ns" <> space <> "no") + , "closing punctuation" =: + ".No no \"Ns\" ns \")\" No no" =?> + para ("nons)" <> space <> "no") ] ] From 608bbb2e8c949ccd4bb709fb041bed6fe8ba7c33 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 22:15:47 -0700 Subject: [PATCH 46/81] Add more mandoc regress tests --- test/Tests/Readers/Mdoc.hs | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index b45a85518773..c2b11930fa55 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -39,6 +39,9 @@ tests = [ , "Sq" =: ".Sq hello world" =?> para (singleQuoted "hello world") + , "empty" =: + ".Dq" =?> + para (doubleQuoted mempty) ] , testGroup "inlines" [ "Sy" =: @@ -54,7 +57,7 @@ tests = [ , testGroup "Ns macro" [ "at the beginning of a macro line (mandoc delta)" =: T.unlines [".Op before", ".Ns Op after"] =?> - para "[before][after]" + para "[before][after]" -- mandoc: warning + "[before] [after]" , "after a block closing macro" =: T.unlines [".Oo before", ".Oc Ns Op after"] =?> para "[before][after]" @@ -63,7 +66,7 @@ tests = [ para "[before][after]" , "before closing punctuation" =: ".Oo before Oc Ns : Op after" =?> - para "[before]: [after]" + para "[before]: [after]" -- mandoc: warning , "after closing punctuation" =: ".Oo before Oc : Ns Op after" =?> para "[before]:[after]" @@ -83,4 +86,32 @@ tests = [ ".No no \"Ns\" ns \")\" No no" =?> para ("nons)" <> space <> "no") ] + , testGroup "inline punctuation" + [ testGroup "leading punctuation" + [ "open paren" =: ".Em ( b" =?> para ("(" <> emph "b") + , "open square bracket" =: ".Em \"[\" b" =?> para ("[" <> emph "b") + -- , "pipe" =: ".Em | b" =?> para ("|" <> space <> emph "b") + , "period" =: ".Em . b" =?> para ("." <> space <> emph "b") + , "comma" =: ".Em , b" =?> para ("," <> space <> emph "b") + , "semicolon" =: ".Em ; b" =?> para (";" <> space <> emph "b") + , "colon" =: ".Em : b" =?> para (":" <> space <> emph "b") + , "question mark" =: ".Em ? b" =?> para ("?" <> space <> emph "b") + , "exclamation" =: ".Em ! b" =?> para ("!" <> space <> emph "b") + , "close paren" =: ".Em ) b" =?> para (")" <> space <> emph "b") + , "close square bracket" =: ".Em \"]\" b" =?> para ("]" <> space <> emph "b") + ] + , testGroup "trailing punctuation" + [ "open paren" =: ".Em a (" =?> para (emph "a" <> space <> "(") + , "open square bracket" =: ".Em a [" =?> para (emph "a" <> space <> "[") + -- , "pipe" =: ".Em a |" =?> para (emph "a" <> space <> "|") + , "period" =: ".Em a ." =?> para (emph "a" <> ".") + , "comma" =: ".Em a ," =?> para (emph "a" <> ",") + , "semicolon" =: ".Em a ;" =?> para (emph "a" <> ";") + , "colon" =: ".Em a :" =?> para (emph "a" <> ":") + , "question mark" =: ".Em a ?" =?> para (emph "a" <> "?") + , "exclamation" =: ".Em a !" =?> para (emph "a" <> "!") + , "close parens" =: ".Em a \")\"" =?> para (emph "a" <> ")") + , "close square bracket" =: ".Em a ]" =?> para (emph "a" <> "]") + ] + ] ] From bb3db9c72d75b75f54d702a44ca5be6d45338ca4 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 22:39:41 -0700 Subject: [PATCH 47/81] Add remaining multiline enclosures --- src/Text/Pandoc/Readers/Mdoc.hs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 6803fe6f2b91..7990a168eaf5 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -337,15 +337,27 @@ parseDo = multilineEnclosure "Do" "Dc" B.doubleQuoted parseSq :: PandocMonad m => MdocParser m Inlines parseSq = lineEnclosure "Sq" B.singleQuoted +parseSo :: PandocMonad m => MdocParser m Inlines +parseSo = multilineEnclosure "So" "Sc" B.singleQuoted + parseQq :: PandocMonad m => MdocParser m Inlines parseQq = lineEnclosure "Qq" $ \x -> "\"" <> x <> "\"" +parseQo :: PandocMonad m => MdocParser m Inlines +parseQo = multilineEnclosure "Qo" "Qc" $ \x -> "\"" <> x <> "\"" + parsePq :: PandocMonad m => MdocParser m Inlines parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")" +parsePo :: PandocMonad m => MdocParser m Inlines +parsePo = multilineEnclosure "Po" "Pc" $ \x -> "(" <> x <> ")" + parseBq :: PandocMonad m => MdocParser m Inlines parseBq = lineEnclosure "Bq" $ \x -> "[" <> x <> "]" +parseBo :: PandocMonad m => MdocParser m Inlines +parseBo = multilineEnclosure "Bo" "Bc" $ \x -> "[" <> x <> "]" + -- For our purposes this probably behaves identically to Bq -- in most circumstances but I might need to do something -- special with it in SYNOPSIS @@ -358,9 +370,15 @@ parseOo = multilineEnclosure "Oo" "Oc" $ \x -> "[" <> x <> "]" parseBrq :: PandocMonad m => MdocParser m Inlines parseBrq = lineEnclosure "Brq" $ \x -> "{" <> x <> "}" +parseBro :: PandocMonad m => MdocParser m Inlines +parseBro = multilineEnclosure "Bro" "Brc" $ \x -> "{" <> x <> "}" + parseAq :: PandocMonad m => MdocParser m Inlines parseAq = lineEnclosure "Aq" $ \x -> "⟨" <> x <> "⟩" +parseAo :: PandocMonad m => MdocParser m Inlines +parseAo = multilineEnclosure "Ao" "Ac" $ \x -> "⟨" <> x <> "⟩" + parseNm :: PandocMonad m => MdocParser m Inlines parseNm = do mnm <- (progName <$> getState) @@ -411,7 +429,13 @@ parseInlineMacro = parseBq, parseBrq, parseAq, + parseSo, parseDo, + parseQo, + parsePo, + parseBo, + parseBro, + parseAo, parseOo, parseNs ] From 07ea56e051c5debb16c6f9cb2dec487d9c41302f Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 23:07:01 -0700 Subject: [PATCH 48/81] Fix actual multiline enclosures Consumption of eol on the line for the opening macro got lost. Add a couple tests. Some irrelevant tests in this commit. Sorry!! --- src/Text/Pandoc/Readers/Mdoc.hs | 9 +++++++-- test/Tests/Readers/Mdoc.hs | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 7990a168eaf5..9deef65d168a 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -304,8 +304,13 @@ but pandoc inlines inside of these multiline enclosures. -} multilineEnclosure :: PandocMonad m => T.Text -> T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines multilineEnclosure op cl xform = do macro op - (first, further, finally) <- delimitedArgs (manyTill parseInlines (macro cl)) - return $ first <> xform (spacify further) <> finally + openDelim <- mconcat <$> many (parseDelim Open) + optional eol + contents <- many parseInline + (macro cl show cl) + closeDelim <- mconcat <$> many (parseDelim Close) + optional eol + return $ openDelim <> xform (spacify contents) <> closeDelim eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines eliminateEmpty x y = if null y then mempty else x y diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index c2b11930fa55..12b53c26e71d 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -42,11 +42,28 @@ tests = [ , "empty" =: ".Dq" =?> para (doubleQuoted mempty) + , "nested" =: + ".Dq Pq hello world" =?> + para (doubleQuoted "(hello world)") + , "with inlines" =: + ".Dq hello Sy world ." =?> + para (doubleQuoted ("hello" <> space <> strong "world" <> ".")) + ] + , testGroup "multiline enclosures" + [ "nested multiline" =: + T.unlines [".Bo", ".Po", "hi", ".Pc", ".Bc"] =?> + para ("[(hi)]") + , "nested on one line" =: + ".Bo Po hi Pc Bc" =?> + para ("[(hi)]") ] , testGroup "inlines" [ "Sy" =: ".Sy hello world" =?> para (strong "hello world") + , "Em" =: + ".Em hello world" =?> + para (emph "hello world") , "delimiters" =: ".Sy ( hello world )" =?> para (mconcat ["(", strong "hello world", ")"]) From ee6be32434be8143bcbcd06ac7b764c8bb187520 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 23:08:06 -0700 Subject: [PATCH 49/81] Remove YAGNI parseLit I will probably actually need it later, but not now --- src/Text/Pandoc/Readers/Mdoc.hs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 9deef65d168a..b7e3f5eccc7e 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -219,11 +219,6 @@ parseStr = do (Str txt _) <- str return $ B.str txt -parseLit :: PandocMonad m => MdocParser m Inlines -parseLit = do - (Lit txt _) <- lit - return $ B.str txt - parseDelim :: PandocMonad m => DelimSide -> MdocParser m Inlines parseDelim pos = do (Delim _ txt _) <- delim pos From ebef1a2fc0eacfad3dac78acaa74e19646c9ece6 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Sun, 15 Sep 2024 23:24:16 -0700 Subject: [PATCH 50/81] fixup: remaining multiline enclosures --- src/Text/Pandoc/Readers/Mdoc.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index b7e3f5eccc7e..6eb6c30436d6 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -344,7 +344,7 @@ parseQq :: PandocMonad m => MdocParser m Inlines parseQq = lineEnclosure "Qq" $ \x -> "\"" <> x <> "\"" parseQo :: PandocMonad m => MdocParser m Inlines -parseQo = multilineEnclosure "Qo" "Qc" $ \x -> "\"" <> x <> "\"" +parseQo = multilineEnclosure "Qo" "Qc" $ \x -> "\"" <> x <> "\"" parsePq :: PandocMonad m => MdocParser m Inlines parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")" From 8664d8e9314592ed3ee318babd80e34c82eba5ad Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Mon, 16 Sep 2024 10:14:18 -0700 Subject: [PATCH 51/81] Handle middle delimiters (i.e. the pipe) --- src/Text/Pandoc/Readers/Mdoc.hs | 17 ++++++++++------- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 7 +++++-- test/Tests/Readers/Mdoc.hs | 4 ++-- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 6eb6c30436d6..1b75f57d952b 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -232,20 +232,23 @@ litsToText = do litsAndDelimsToText :: PandocMonad m => MdocParser m Inlines litsAndDelimsToText = do - ods <- mconcat <$> many (parseDelim Open) - ls <- many lit - cds <- mconcat <$> if null ods && null ls - then many1 (parseDelim Close) - else many (parseDelim Close) + (o, ls, c) <- delimitedArgs $ many lit + guard $ not (null o && null ls && null c) let strs = map (B.str . toString) ls - return $ ods <> spacify strs <> cds + return $ o <> spacify strs <> c delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines) delimitedArgs p = do openDelim <- mconcat <$> many (parseDelim Open) + omids <- spacify <$> many (parseDelim Middle) + let omid | null omids = mempty + | otherwise = omids <> B.space inlines <- p + cmids <- spacify <$> many (parseDelim Middle) + let cmid | null cmids = mempty + | otherwise = B.space <> cmids closeDelim <- mconcat <$> many (parseDelim Close) - return (openDelim, inlines, closeDelim) + return (openDelim <> omid, inlines, cmid <> closeDelim) -- TODO extract further? simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index b49a6e7845ca..3cf741101a02 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -171,11 +171,14 @@ lexCallableMacro = do guard $ isCallableMacro name return $ Macro name pos -lexDelim :: PandocMonad m => Lexer m MdocToken +lexDelim :: (PandocMonad m) => Lexer m MdocToken lexDelim = do pos <- getPosition q <- optionMaybe quoteChar - t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"] + t <- + Delim Open <$> oneOfStrings ["(", "["] + <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"] + <|> Delim Middle <$> textStr "|" when (isJust q) (void quoteChar) eof <|> void (lookAhead (spaceChar <|> newline)) skipSpaces diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index 12b53c26e71d..ba671d871a9b 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -107,7 +107,7 @@ tests = [ [ testGroup "leading punctuation" [ "open paren" =: ".Em ( b" =?> para ("(" <> emph "b") , "open square bracket" =: ".Em \"[\" b" =?> para ("[" <> emph "b") - -- , "pipe" =: ".Em | b" =?> para ("|" <> space <> emph "b") + , "pipe" =: ".Em | b" =?> para ("|" <> space <> emph "b") , "period" =: ".Em . b" =?> para ("." <> space <> emph "b") , "comma" =: ".Em , b" =?> para ("," <> space <> emph "b") , "semicolon" =: ".Em ; b" =?> para (";" <> space <> emph "b") @@ -120,7 +120,7 @@ tests = [ , testGroup "trailing punctuation" [ "open paren" =: ".Em a (" =?> para (emph "a" <> space <> "(") , "open square bracket" =: ".Em a [" =?> para (emph "a" <> space <> "[") - -- , "pipe" =: ".Em a |" =?> para (emph "a" <> space <> "|") + , "pipe" =: ".Em a |" =?> para (emph "a" <> space <> "|") , "period" =: ".Em a ." =?> para (emph "a" <> ".") , "comma" =: ".Em a ," =?> para (emph "a" <> ",") , "semicolon" =: ".Em a ;" =?> para (emph "a" <> ";") From 35cd77beb251ac0e2a7961ded4415d0fcdef9de8 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Mon, 16 Sep 2024 10:26:30 -0700 Subject: [PATCH 52/81] Add middle punctuation tests from mandoc regress --- test/Tests/Readers/Mdoc.hs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index ba671d871a9b..be154635aa54 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -130,5 +130,18 @@ tests = [ , "close parens" =: ".Em a \")\"" =?> para (emph "a" <> ")") , "close square bracket" =: ".Em a ]" =?> para (emph "a" <> "]") ] + , testGroup "middle punctuation" + [ "open paren" =: ".Em a ( b" =?> para (mconcat [emph "a", space, "(", emph "b"]) + , "open square bracket" =: ".Em a [ b" =?> para (mconcat [emph "a", space, "[", emph "b"]) + , "pipe" =: ".Em a \"|\" b" =?> para (mconcat [emph "a", space, "|", space, emph "b"]) + , "period" =: ".Em a . b" =?> para (mconcat [emph "a", ".", space, emph "b"]) + , "comma" =: ".Em a , b" =?> para (mconcat [emph "a", ",", space, emph "b"]) + , "semicolon" =: ".Em a ; b" =?> para (mconcat [emph "a", ";", space, emph "b"]) + , "colon" =: ".Em a \":\" b" =?> para (mconcat [emph "a", ":", space, emph "b"]) + , "question mark" =: ".Em a ? b" =?> para (mconcat [emph "a", "?", space, emph "b"]) + , "exclamation" =: ".Em a ! b" =?> para (mconcat [emph "a", "!", space, emph "b"]) + , "close paren" =: ".Em a ) b" =?> para (mconcat [emph "a", ")", space, emph "b"]) + , "close square bracket" =: ".Em a ] b" =?> para (mconcat [emph "a", "]", space, emph "b"]) + ] ] ] From f3b8fee8acf65825af04a438b7224550e5645f85 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Mon, 16 Sep 2024 12:52:45 -0700 Subject: [PATCH 53/81] fixup: Spacing enclosures --- src/Text/Pandoc/Readers/Mdoc.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 1b75f57d952b..f14d88937c2d 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -335,13 +335,13 @@ parseDq :: PandocMonad m => MdocParser m Inlines parseDq = lineEnclosure "Dq" B.doubleQuoted parseDo :: PandocMonad m => MdocParser m Inlines -parseDo = multilineEnclosure "Do" "Dc" B.doubleQuoted +parseDo = multilineEnclosure "Do" "Dc" B.doubleQuoted parseSq :: PandocMonad m => MdocParser m Inlines parseSq = lineEnclosure "Sq" B.singleQuoted parseSo :: PandocMonad m => MdocParser m Inlines -parseSo = multilineEnclosure "So" "Sc" B.singleQuoted +parseSo = multilineEnclosure "So" "Sc" B.singleQuoted parseQq :: PandocMonad m => MdocParser m Inlines parseQq = lineEnclosure "Qq" $ \x -> "\"" <> x <> "\"" From 81504eaeb92e68600bab4abd46092c7c545b6734 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Mon, 16 Sep 2024 13:01:59 -0700 Subject: [PATCH 54/81] Add Lk support for hyperlinks Refactor delimiter parsing to support special needs. Could end up unified again at some point if I think harder. --- src/Text/Pandoc/Readers/Mdoc.hs | 72 ++++++++++++++++++++++++++------- test/Tests/Readers/Mdoc.hs | 13 +++++- 2 files changed, 70 insertions(+), 15 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index f14d88937c2d..e3107b31ac31 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -149,15 +149,18 @@ eol = void $ msatisfy t where t Eol{} = True t _ = False +newControlContext :: MdocToken -> Bool +newControlContext Eol{} = True +newControlContext Macro{} = True +newControlContext Str{} = True +newControlContext Tbl{} = True +newControlContext Blank{} = True +newControlContext Lit{} = False +newControlContext Delim{} = False + + inlineContextEnd :: PandocMonad m => MdocParser m () -inlineContextEnd = eof <|> (void . lookAhead $ msatisfy t) where - t Eol{} = True - t Macro{} = True - t Str{} = True -- shouldn't be lexed - t Tbl{} = True -- shouldn't be lexed - t Blank{} = True -- shouldn't be lexed - t Lit{} = False - t Delim{} = False +inlineContextEnd = eof <|> (void . lookAhead $ msatisfy newControlContext) argsToInlines :: PandocMonad m => MdocParser m Inlines argsToInlines = do @@ -237,18 +240,31 @@ litsAndDelimsToText = do let strs = map (B.str . toString) ls return $ o <> spacify strs <> c -delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines) -delimitedArgs p = do +openingDelimiters :: PandocMonad m => MdocParser m Inlines +openingDelimiters = do openDelim <- mconcat <$> many (parseDelim Open) - omids <- spacify <$> many (parseDelim Middle) + omids <- pipes let omid | null omids = mempty | otherwise = omids <> B.space - inlines <- p - cmids <- spacify <$> many (parseDelim Middle) + return $ openDelim <> omid + +pipes :: PandocMonad m => MdocParser m Inlines +pipes = spacify <$> many (parseDelim Middle) + +closingDelimiters :: PandocMonad m => MdocParser m Inlines +closingDelimiters = do + cmids <- pipes let cmid | null cmids = mempty | otherwise = B.space <> cmids closeDelim <- mconcat <$> many (parseDelim Close) - return (openDelim <> omid, inlines, cmid <> closeDelim) + return $ cmid <> closeDelim + +delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines) +delimitedArgs p = do + openDelim <- openingDelimiters + inlines <- p + closeDelim <- closingDelimiters + return (openDelim, inlines, closeDelim) -- TODO extract further? simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines @@ -409,6 +425,33 @@ parseXr = do s <- lit "Xr manual section" return (toString n, toString s) +parseLk :: PandocMonad m => MdocParser m Inlines +parseLk = do + macro "Lk" + openClose <- closingDelimiters + openOpen <- openingDelimiters + url <- toString <$> lit + inner <- spacify <$> many segment + close <- closingDelimiters + let label | null inner = B.str url + | otherwise = inner + return $ open openClose openOpen <> B.link url "" label <> close + where + open a b + | null a = b + | null b = a + | otherwise = a <> B.space <> b + end = msatisfy newControlContext + segment = do + a <- openingDelimiters + m <- option mempty litsToText + z <- + try (closingDelimiters <* notFollowedBy end) + <|> option mempty pipes + guard $ not $ all null [a, m, z] + return $ a <> m <> z + + parseNs :: PandocMonad m => MdocParser m Inlines parseNs = macro "Ns" >> return noSpace @@ -420,6 +463,7 @@ parseInlineMacro = choice [ parseSy, parseEm, + parseLk, parseNo, parseNm, parseXr, diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index be154635aa54..85d47cce45c9 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -57,7 +57,7 @@ tests = [ ".Bo Po hi Pc Bc" =?> para ("[(hi)]") ] - , testGroup "inlines" + , testGroup "simple inlines" [ "Sy" =: ".Sy hello world" =?> para (strong "hello world") @@ -71,6 +71,17 @@ tests = [ ".Sy hello Em world" =?> para (strong "hello" <> space <> emph "world") ] + , testGroup "links" + [ "basic" =: + ".Lk href name" =?> + para (link "href" "" "name") + , "complicated" =: + ".Lk , ( href name )" =?> + para ("," <> space <> "(" <> link "href" "" "name" <> ")") + , "unnamed" =: + ".Lk href" =?> + para (link "href" "" "href") + ] , testGroup "Ns macro" [ "at the beginning of a macro line (mandoc delta)" =: T.unlines [".Op before", ".Ns Op after"] =?> From ecbb1dbab3dce6a6c64c8f0339a8513541f0ad10 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Mon, 16 Sep 2024 21:36:14 -0700 Subject: [PATCH 55/81] Add Mt macro --- src/Text/Pandoc/Readers/Mdoc.hs | 9 +++++++++ test/Tests/Readers/Mdoc.hs | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index e3107b31ac31..82b759ec2645 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -344,6 +344,14 @@ parseEm = simpleInline "Em" (eliminateEmpty B.emph) parseNo :: PandocMonad m => MdocParser m Inlines parseNo = simpleInline "No" (eliminateEmpty id) +-- I'm not sure why mandoc inserts a ~ when Mt is missing an argument, +-- but it does, and it doesn't issue a warning, so that quirk is +-- retained. +parseMt :: PandocMonad m => MdocParser m Inlines +parseMt = simpleInline "Mt" mailto + where mailto x | null x = B.link ("mailto:~") "" "~" + | otherwise = B.link ("mailto:" <> stringify x) "" x + parseQl :: PandocMonad m => MdocParser m Inlines parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify @@ -464,6 +472,7 @@ parseInlineMacro = [ parseSy, parseEm, parseLk, + parseMt, parseNo, parseNm, parseXr, diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index 85d47cce45c9..b9f0454096fc 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -64,6 +64,10 @@ tests = [ , "Em" =: ".Em hello world" =?> para (emph "hello world") + , "Mt" =: + ".Mt a@example.org , b@example.org" =?> + para ((link "mailto:a@example.org" "" "a@example.org") <> + "," <> space <> (link "mailto:b@example.org" "" "b@example.org")) , "delimiters" =: ".Sy ( hello world )" =?> para (mconcat ["(", strong "hello world", ")"]) From 309e864a23784a67c9b9e08a9803e6ddb1a5bb58 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Mon, 16 Sep 2024 21:37:05 -0700 Subject: [PATCH 56/81] Add .No test --- test/Tests/Readers/Mdoc.hs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index b9f0454096fc..f2e6a152b232 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -68,6 +68,9 @@ tests = [ ".Mt a@example.org , b@example.org" =?> para ((link "mailto:a@example.org" "" "a@example.org") <> "," <> space <> (link "mailto:b@example.org" "" "b@example.org")) + , "No" =: + ".No ( hello , world ! )" =?> + para "(hello, world!)" , "delimiters" =: ".Sy ( hello world )" =?> para (mconcat ["(", strong "hello world", ")"]) From 2f287508675ac2b9f50675885efcba9039f3bc95 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 18 Sep 2024 18:03:04 -0700 Subject: [PATCH 57/81] Add Pa (path) macro --- src/Text/Pandoc/Readers/Mdoc.hs | 6 ++++++ test/Tests/Readers/Mdoc.hs | 3 +++ 2 files changed, 9 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 82b759ec2645..a1068015a075 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -352,6 +352,11 @@ parseMt = simpleInline "Mt" mailto where mailto x | null x = B.link ("mailto:~") "" "~" | otherwise = B.link ("mailto:" <> stringify x) "" x +parsePa :: PandocMonad m => MdocParser m Inlines +parsePa = simpleInline "Pa" p + where p x | null x = B.spanWith (cls "Pa") "~" + | otherwise = B.spanWith (cls "Pa") x + parseQl :: PandocMonad m => MdocParser m Inlines parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify @@ -473,6 +478,7 @@ parseInlineMacro = parseEm, parseLk, parseMt, + parsePa, parseNo, parseNm, parseXr, diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index f2e6a152b232..9f79d3f2e123 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -71,6 +71,9 @@ tests = [ , "No" =: ".No ( hello , world ! )" =?> para "(hello, world!)" + , "empty Pa with closing punctuation" =: + ".Pa ) z" =?> + para (spanWith (mempty, ["Pa"], mempty) "~" <> ")" <> space <> spanWith (mempty, ["Pa"], mempty) "z") , "delimiters" =: ".Sy ( hello world )" =?> para (mconcat ["(", strong "hello world", ")"]) From f921ba82dd44bbcf5eada60fa58790cdca98370c Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 19 Sep 2024 16:13:16 -0700 Subject: [PATCH 58/81] Add Fl, Ev, Cm and tests --- src/Text/Pandoc/Readers/Mdoc.hs | 46 +++++++++++++++++++++++++++------ test/Tests/Readers/Mdoc.hs | 25 +++++++++++++++++- 2 files changed, 62 insertions(+), 9 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index a1068015a075..be0092c34e9a 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -227,14 +227,19 @@ parseDelim pos = do (Delim _ txt _) <- delim pos return $ B.str txt -litsToText :: PandocMonad m => MdocParser m Inlines +litsToText :: PandocMonad m => MdocParser m [T.Text] litsToText = do + ls <- many1 lit + return $ map toString ls + +litsToInlines :: PandocMonad m => MdocParser m Inlines +litsToInlines = do ls <- many1 lit let strs = map (B.str . toString) ls return $ spacify strs -litsAndDelimsToText :: PandocMonad m => MdocParser m Inlines -litsAndDelimsToText = do +litsAndDelimsToInlines :: PandocMonad m => MdocParser m Inlines +litsAndDelimsToInlines = do (o, ls, c) <- delimitedArgs $ many lit guard $ not (null o && null ls && null c) let strs = map (B.str . toString) ls @@ -272,6 +277,16 @@ simpleInline nm xform = do macro nm segs <- manyTill segment inlineContextEnd return $ spacify segs + where + segment = do + (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToInlines + return $ openDelim <> xform inlines <> closeDelim + +argsInline :: PandocMonad m => T.Text -> ([T.Text] -> Inlines) -> MdocParser m Inlines +argsInline nm xform = do + macro nm + segs <- manyTill segment inlineContextEnd + return $ spacify segs where segment = do (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText @@ -283,8 +298,8 @@ lineEnclosure nm xform = do --- XXX wtf (first, further, finally) <- delimitedArgs (manyTill - (parseInlineMacro <|> (try (litsAndDelimsToText <* notFollowedBy eol)) - <|> litsToText) (lookAhead (many (macro "Ns" <|> delim Close) *> eol))) + (parseInlineMacro <|> (try (litsAndDelimsToInlines <* notFollowedBy eol)) + <|> litsToInlines) (lookAhead (many (macro "Ns" <|> delim Close) *> eol))) return $ first <> xform (spacify further) <> finally noSpace :: Inlines @@ -344,6 +359,9 @@ parseEm = simpleInline "Em" (eliminateEmpty B.emph) parseNo :: PandocMonad m => MdocParser m Inlines parseNo = simpleInline "No" (eliminateEmpty id) +parseEv :: PandocMonad m => MdocParser m Inlines +parseEv = simpleInline "Ev" (eliminateEmpty (B.codeWith (cls "Ev") . stringify)) + -- I'm not sure why mandoc inserts a ~ when Mt is missing an argument, -- but it does, and it doesn't issue a warning, so that quirk is -- retained. @@ -357,6 +375,14 @@ parsePa = simpleInline "Pa" p where p x | null x = B.spanWith (cls "Pa") "~" | otherwise = B.spanWith (cls "Pa") x +parseFl :: PandocMonad m => MdocParser m Inlines +parseFl = argsInline "Fl" (spacify . fl . flags) + where fl = map $ B.codeWith (cls "Fl") + flags = map ("-" <>) + +parseCm :: PandocMonad m => MdocParser m Inlines +parseCm = simpleInline "Cm" $ B.codeWith (cls "Cm") . stringify + parseQl :: PandocMonad m => MdocParser m Inlines parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify @@ -416,7 +442,7 @@ parseNm = do mnm <- (progName <$> getState) case mnm of Nothing -> do - (_, nm, _) <- lookAhead $ delimitedArgs $ option mempty litsToText + (_, nm, _) <- lookAhead $ delimitedArgs $ option mempty litsToInlines guard $ not (null nm) simpleInline "Nm" ok Just nm -> simpleInline "Nm" $ \x -> @@ -438,6 +464,7 @@ parseXr = do s <- lit "Xr manual section" return (toString n, toString s) +-- TODO: can I rewrite this with argsInline? parseLk :: PandocMonad m => MdocParser m Inlines parseLk = do macro "Lk" @@ -457,7 +484,7 @@ parseLk = do end = msatisfy newControlContext segment = do a <- openingDelimiters - m <- option mempty litsToText + m <- option mempty litsToInlines z <- try (closingDelimiters <* notFollowedBy end) <|> option mempty pipes @@ -477,8 +504,11 @@ parseInlineMacro = [ parseSy, parseEm, parseLk, + parseEv, parseMt, parsePa, + parseFl, + parseCm, parseNo, parseNm, parseXr, @@ -504,7 +534,7 @@ parseInlineMacro = parseInline :: PandocMonad m => MdocParser m Inlines parseInline = parseStr <|> - ((parseInlineMacro <|> litsAndDelimsToText) <* optional eol) + ((parseInlineMacro <|> litsAndDelimsToInlines) <* optional eol) -- TODO probably need some kind of fold to deal with Ns diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index 9f79d3f2e123..537952bbff4c 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -30,6 +30,9 @@ infix 4 =: => String -> (Text, c) -> TestTree (=:) = test mdoc +cls :: Text -> Attr +cls x = (mempty, [x], mempty) + tests :: [TestTree] tests = [ testGroup "one-line enclosures" @@ -39,6 +42,9 @@ tests = [ , "Sq" =: ".Sq hello world" =?> para (singleQuoted "hello world") + , "Ev" =: + ".Ev HELLO_WORLD ," =?> + para (codeWith (cls "Ev") "HELLO_WORLD" <> ",") , "empty" =: ".Dq" =?> para (doubleQuoted mempty) @@ -73,7 +79,7 @@ tests = [ para "(hello, world!)" , "empty Pa with closing punctuation" =: ".Pa ) z" =?> - para (spanWith (mempty, ["Pa"], mempty) "~" <> ")" <> space <> spanWith (mempty, ["Pa"], mempty) "z") + para (spanWith (cls "Pa") "~" <> ")" <> space <> spanWith (mempty, ["Pa"], mempty) "z") , "delimiters" =: ".Sy ( hello world )" =?> para (mconcat ["(", strong "hello world", ")"]) @@ -81,6 +87,23 @@ tests = [ ".Sy hello Em world" =?> para (strong "hello" <> space <> emph "world") ] + , testGroup "Fl" + [ "simple" =: + ".Fl w" =?> + para (codeWith (cls "Fl") "-w") + , "multiple" =: + ".Fl W all" =?> + para (codeWith (cls "Fl") "-W" <> space <> codeWith (cls "Fl") "-all") + , "GNU" =: + ".Fl -help" =?> + para (codeWith (cls "Fl") "--help") + , "GNU escaped" =: + ".Fl \\-help" =?> + para (codeWith (cls "Fl") "--help") + , "punctuation" =: + ".Op Fl a | b" =?> + para ("[" <> codeWith (cls "Fl") "-a" <> " | " <> codeWith (cls "Fl") "-b" <> "]") + ] , testGroup "links" [ "basic" =: ".Lk href name" =?> From 79c34c948fe8e76a5ec160c1f55139a19fd3f262 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 19 Sep 2024 16:17:17 -0700 Subject: [PATCH 59/81] fixup Ev: Move test to the right section --- test/Tests/Readers/Mdoc.hs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index 537952bbff4c..1c7f6a9dde39 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -42,9 +42,6 @@ tests = [ , "Sq" =: ".Sq hello world" =?> para (singleQuoted "hello world") - , "Ev" =: - ".Ev HELLO_WORLD ," =?> - para (codeWith (cls "Ev") "HELLO_WORLD" <> ",") , "empty" =: ".Dq" =?> para (doubleQuoted mempty) @@ -70,6 +67,9 @@ tests = [ , "Em" =: ".Em hello world" =?> para (emph "hello world") + , "Ev" =: + ".Ev HELLO_WORLD ," =?> + para (codeWith (cls "Ev") "HELLO_WORLD" <> ",") , "Mt" =: ".Mt a@example.org , b@example.org" =?> para ((link "mailto:a@example.org" "" "a@example.org") <> From 86af56880ca4274467095d26662dc52830f6f2ef Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Fri, 20 Sep 2024 12:02:42 -0700 Subject: [PATCH 60/81] Refine Fl parser There's a number of unique-looking cases for Fl parsing so I am just handling them very explicitly instead of trying to generalize anything enough to handle it. --- src/Text/Pandoc/Readers/Mdoc.hs | 40 +++++++++++++++++++++++---------- test/Tests/Readers/Mdoc.hs | 18 +++++++++++++++ 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index be0092c34e9a..9d89785f1b22 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -14,6 +14,7 @@ Conversion of mdoc to 'Pandoc' document. module Text.Pandoc.Readers.Mdoc (readMdoc) where import Data.Default (Default) +import Data.Functor (($>)) import Control.Monad (mplus, guard, void, when) import Control.Monad.Except (throwError) import Data.List (intersperse) @@ -110,6 +111,11 @@ macro name = msatisfy t where t (Macro n _) = n == name t _ = False +anyMacro :: PandocMonad m => MdocParser m MdocToken +anyMacro = msatisfy t where + t (Macro _ _) = True + t _ = False + emptyMacro :: PandocMonad m => T.Text -> MdocParser m MdocToken emptyMacro n = macro n <* eol @@ -282,15 +288,6 @@ simpleInline nm xform = do (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToInlines return $ openDelim <> xform inlines <> closeDelim -argsInline :: PandocMonad m => T.Text -> ([T.Text] -> Inlines) -> MdocParser m Inlines -argsInline nm xform = do - macro nm - segs <- manyTill segment inlineContextEnd - return $ spacify segs - where - segment = do - (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText - return $ openDelim <> xform inlines <> closeDelim lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines lineEnclosure nm xform = do @@ -376,9 +373,28 @@ parsePa = simpleInline "Pa" p | otherwise = B.spanWith (cls "Pa") x parseFl :: PandocMonad m => MdocParser m Inlines -parseFl = argsInline "Fl" (spacify . fl . flags) - where fl = map $ B.codeWith (cls "Fl") - flags = map ("-" <>) +parseFl = do + macro "Fl" + start <- option mempty (emptyWithDelim <|> emptyWithMacro <|> emptyEmpty) + segs <- manyTill segment inlineContextEnd + return $ spacify ([start] <> segs) + where + emptyWithDelim = do + lookAhead $ many1 (delim Middle <|> delim Close) + ds <- closingDelimiters + return $ fl "-" <> ds + emptyWithMacro = do + lookAhead anyMacro + rest <- parseInlines + return $ fl "-" <> rest + emptyEmpty = lookAhead eol $> fl "-" + segment = do + (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText + return $ openDelim <> (spacify . (map fl) . flags) inlines <> closeDelim + fl = B.codeWith (cls "Fl") + flags [] = ["-"] + flags xs = map ("-" <>) xs + parseCm :: PandocMonad m => MdocParser m Inlines parseCm = simpleInline "Cm" $ B.codeWith (cls "Cm") . stringify diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index 1c7f6a9dde39..a35f2fd9ca13 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -94,6 +94,12 @@ tests = [ , "multiple" =: ".Fl W all" =?> para (codeWith (cls "Fl") "-W" <> space <> codeWith (cls "Fl") "-all") + , "empty with following macro" =: + ".Fl Cm x" =?> + para (codeWith (cls "Fl") "-" <> codeWith (cls "Cm") "x") + , "following Ns" =: + ".Fl W Ns Cm all" =?> + para (codeWith (cls "Fl") "-W" <> codeWith (cls "Cm") "all") , "GNU" =: ".Fl -help" =?> para (codeWith (cls "Fl") "--help") @@ -103,6 +109,18 @@ tests = [ , "punctuation" =: ".Op Fl a | b" =?> para ("[" <> codeWith (cls "Fl") "-a" <> " | " <> codeWith (cls "Fl") "-b" <> "]") + , "middle close paren" =: + ".Fl a ) z" =?> + para (codeWith (cls "Fl") "-a" <> ") " <> codeWith (cls "Fl") "-z") + , "empty with close paren" =: + ".Fl ) z" =?> + para (codeWith (cls "Fl") "-" <> ") " <> codeWith (cls "Fl") "-z") + , "empty with pipe" =: + ".Fl | z" =?> + para (codeWith (cls "Fl") "-" <> " | " <> codeWith (cls "Fl") "-z") + , "empty with parens" =: + ".Fl ( )" =?> + para ("(" <> codeWith (cls "Fl") "-" <> ")") ] , testGroup "links" [ "basic" =: From 5a2196e1c17fa1e242b9c9d133754ed3bdf01941 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Fri, 20 Sep 2024 14:39:20 -0700 Subject: [PATCH 61/81] Extract helper for inline macros that produce Code --- src/Text/Pandoc/Readers/Mdoc.hs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 9d89785f1b22..c32be9b05da2 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -288,6 +288,9 @@ simpleInline nm xform = do (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToInlines return $ openDelim <> xform inlines <> closeDelim +codeLikeInline :: PandocMonad m => T.Text -> MdocParser m Inlines +codeLikeInline nm = do + simpleInline nm (eliminateEmpty (B.codeWith (cls nm) . stringify)) lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines lineEnclosure nm xform = do @@ -357,7 +360,7 @@ parseNo :: PandocMonad m => MdocParser m Inlines parseNo = simpleInline "No" (eliminateEmpty id) parseEv :: PandocMonad m => MdocParser m Inlines -parseEv = simpleInline "Ev" (eliminateEmpty (B.codeWith (cls "Ev") . stringify)) +parseEv = codeLikeInline "Ev" -- I'm not sure why mandoc inserts a ~ when Mt is missing an argument, -- but it does, and it doesn't issue a warning, so that quirk is @@ -397,7 +400,7 @@ parseFl = do parseCm :: PandocMonad m => MdocParser m Inlines -parseCm = simpleInline "Cm" $ B.codeWith (cls "Cm") . stringify +parseCm = codeLikeInline "Cm" parseQl :: PandocMonad m => MdocParser m Inlines parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify From 162f48d77930baaf30a7be693f568f6ac0c93beb Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Fri, 20 Sep 2024 14:41:00 -0700 Subject: [PATCH 62/81] Zap useless comments --- src/Text/Pandoc/Readers/Mdoc.hs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index c32be9b05da2..987361252bec 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -347,7 +347,6 @@ eliminateEmpty x y = if null y then mempty else x y cls :: T.Text -> B.Attr cls x = (mempty, [x], mempty) --- Sy: callable, parsed, >0 arguments -- mandoc -T html formats Sy with a tag, since it's not really -- semantically , but Strong is our best option in Pandoc parseSy :: PandocMonad m => MdocParser m Inlines @@ -470,7 +469,6 @@ parseNm = do else ok x where ok = B.codeWith (cls "Nm") . stringify --- Xr parseXr :: PandocMonad m => MdocParser m Inlines parseXr = do macro "Xr" From 2cf842d04624e4d7f0306dade7ad7ab62bbbb385 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Fri, 20 Sep 2024 14:45:36 -0700 Subject: [PATCH 63/81] Remove useless do --- src/Text/Pandoc/Readers/Mdoc.hs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 987361252bec..9b6447b08e8a 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -289,8 +289,7 @@ simpleInline nm xform = do return $ openDelim <> xform inlines <> closeDelim codeLikeInline :: PandocMonad m => T.Text -> MdocParser m Inlines -codeLikeInline nm = do - simpleInline nm (eliminateEmpty (B.codeWith (cls nm) . stringify)) +codeLikeInline nm = simpleInline nm (eliminateEmpty (B.codeWith (cls nm) . stringify)) lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines lineEnclosure nm xform = do From 91e17306292eb2a15605407d42c3a83a99ba44b8 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Fri, 20 Sep 2024 15:24:57 -0700 Subject: [PATCH 64/81] Add .Fl Fl support --- src/Text/Pandoc/Readers/Mdoc.hs | 7 ++++++- test/Tests/Readers/Mdoc.hs | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 9b6447b08e8a..bb08e410fdb1 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -376,7 +376,7 @@ parsePa = simpleInline "Pa" p parseFl :: PandocMonad m => MdocParser m Inlines parseFl = do macro "Fl" - start <- option mempty (emptyWithDelim <|> emptyWithMacro <|> emptyEmpty) + start <- option mempty (emptyWithDelim <|> flfl <|> emptyWithMacro <|> emptyEmpty) segs <- manyTill segment inlineContextEnd return $ spacify ([start] <> segs) where @@ -384,6 +384,11 @@ parseFl = do lookAhead $ many1 (delim Middle <|> delim Close) ds <- closingDelimiters return $ fl "-" <> ds + flfl = do + lookAhead (macro "Fl") + x:xs <- B.toList <$> parseFl + let xx = B.codeWith (cls "Fl") $ "-" <> stringify x + return $ xx <> B.fromList xs emptyWithMacro = do lookAhead anyMacro rest <- parseInlines diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index a35f2fd9ca13..5d007e05dacd 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -106,6 +106,9 @@ tests = [ , "GNU escaped" =: ".Fl \\-help" =?> para (codeWith (cls "Fl") "--help") + , "GNU Fl Fl" =: + ".Fl Fl help" =?> + para (codeWith (cls "Fl") "--help") , "punctuation" =: ".Op Fl a | b" =?> para ("[" <> codeWith (cls "Fl") "-a" <> " | " <> codeWith (cls "Fl") "-b" <> "]") From 2a1688bb6c34e9c9a2c78a73c74f0a79221adbf2 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Fri, 20 Sep 2024 15:25:50 -0700 Subject: [PATCH 65/81] Add Ar macro --- src/Text/Pandoc/Readers/Mdoc.hs | 6 ++++++ test/Tests/Readers/Mdoc.hs | 3 +++ 2 files changed, 9 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index bb08e410fdb1..5a3166d47230 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -401,6 +401,11 @@ parseFl = do flags [] = ["-"] flags xs = map ("-" <>) xs +parseAr :: PandocMonad m => MdocParser m Inlines +parseAr = simpleInline "Ar" ar + where ar x | null x = B.codeWith (cls "variable") "file ..." + | otherwise = B.codeWith (cls "variable") $ stringify x + parseCm :: PandocMonad m => MdocParser m Inlines parseCm = codeLikeInline "Cm" @@ -530,6 +535,7 @@ parseInlineMacro = parsePa, parseFl, parseCm, + parseAr, parseNo, parseNm, parseXr, diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index 5d007e05dacd..e1bc78ef001b 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -70,6 +70,9 @@ tests = [ , "Ev" =: ".Ev HELLO_WORLD ," =?> para (codeWith (cls "Ev") "HELLO_WORLD" <> ",") + , "Ar" =: + ".Ar ) z" =?> + para (codeWith (cls "variable") "file ..." <> ") " <> codeWith (cls "variable") "z") , "Mt" =: ".Mt a@example.org , b@example.org" =?> para ((link "mailto:a@example.org" "" "a@example.org") <> From c9d1530cd6d43bb562d847574fcb766551898dbe Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Fri, 20 Sep 2024 16:09:01 -0700 Subject: [PATCH 66/81] Add more easy macros --- src/Text/Pandoc/Readers/Mdoc.hs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 5a3166d47230..a3e785efaab5 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -291,6 +291,9 @@ simpleInline nm xform = do codeLikeInline :: PandocMonad m => T.Text -> MdocParser m Inlines codeLikeInline nm = simpleInline nm (eliminateEmpty (B.codeWith (cls nm) . stringify)) +spanLikeInline :: PandocMonad m => T.Text -> MdocParser m Inlines +spanLikeInline nm = simpleInline nm (eliminateEmpty (B.spanWith (cls nm))) + lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines lineEnclosure nm xform = do macro nm @@ -360,6 +363,12 @@ parseNo = simpleInline "No" (eliminateEmpty id) parseEv :: PandocMonad m => MdocParser m Inlines parseEv = codeLikeInline "Ev" +parseAd :: PandocMonad m => MdocParser m Inlines +parseAd = spanLikeInline "Ad" + +parseMs :: PandocMonad m => MdocParser m Inlines +parseMs = spanLikeInline "Ms" + -- I'm not sure why mandoc inserts a ~ when Mt is missing an argument, -- but it does, and it doesn't issue a warning, so that quirk is -- retained. @@ -410,6 +419,9 @@ parseAr = simpleInline "Ar" ar parseCm :: PandocMonad m => MdocParser m Inlines parseCm = codeLikeInline "Cm" +parseIc :: PandocMonad m => MdocParser m Inlines +parseIc = codeLikeInline "Ic" + parseQl :: PandocMonad m => MdocParser m Inlines parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify @@ -535,6 +547,9 @@ parseInlineMacro = parsePa, parseFl, parseCm, + parseIc, + parseAd, + parseMs, parseAr, parseNo, parseNm, From 9b63a48e6db99015ee115974e94a960517cb0f65 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Fri, 20 Sep 2024 16:09:30 -0700 Subject: [PATCH 67/81] Zap comment alluding to an abandoned helper --- src/Text/Pandoc/Readers/Mdoc.hs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index a3e785efaab5..b884e85bfd31 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -502,7 +502,6 @@ parseXr = do s <- lit "Xr manual section" return (toString n, toString s) --- TODO: can I rewrite this with argsInline? parseLk :: PandocMonad m => MdocParser m Inlines parseLk = do macro "Lk" From 8fe57d37832088bff854ecff5db64725d6c7ed68 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Fri, 20 Sep 2024 17:09:23 -0700 Subject: [PATCH 68/81] Add In macro (non-SYNOPSIS version) --- src/Text/Pandoc/Readers/Mdoc.hs | 15 +++++++++++++++ test/Tests/Readers/Mdoc.hs | 3 +++ 2 files changed, 18 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index b884e85bfd31..10ef60f9f848 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -502,6 +502,20 @@ parseXr = do s <- lit "Xr manual section" return (toString n, toString s) +parseIn :: PandocMonad m => MdocParser m Inlines +parseIn = do + macro "In" + openClose <- closingDelimiters + openOpen <- openingDelimiters + header <- toString <$> lit + close <- closingDelimiters + return $ open openClose openOpen <> B.codeWith (cls "In") ("<" <> header <> ">") <> close + where + open a b + | null a = b + | null b = a + | otherwise = a <> B.space <> b + parseLk :: PandocMonad m => MdocParser m Inlines parseLk = do macro "Lk" @@ -550,6 +564,7 @@ parseInlineMacro = parseAd, parseMs, parseAr, + parseIn, parseNo, parseNm, parseXr, diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index e1bc78ef001b..896b56ae7be3 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -73,6 +73,9 @@ tests = [ , "Ar" =: ".Ar ) z" =?> para (codeWith (cls "variable") "file ..." <> ") " <> codeWith (cls "variable") "z") + , "In" =: + ".In ( math.h ) b c" =?> + para ("(" <> codeWith (cls "In") "" <> ") b c") , "Mt" =: ".Mt a@example.org , b@example.org" =?> para ((link "mailto:a@example.org" "" "a@example.org") <> From dbf31cb30a69a8c49d399e2067483cd9d546485c Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Tue, 24 Sep 2024 14:31:32 -0700 Subject: [PATCH 69/81] Ignore .Ns at start of line in mdoc lexer Solves a delta with mandoc --- src/Text/Pandoc/Readers/Mdoc/Lex.hs | 7 ++++++- test/Tests/Readers/Mdoc.hs | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs index 3cf741101a02..a54e0915154c 100644 --- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs +++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs @@ -208,9 +208,14 @@ lexControlLine = do guard $ sourceColumn pos == 1 char '.' m@(Macro name _) <- lexMacro + -- .Ns macros at the start of a line are ignored. We'd have to look behind + -- to keep track of the "start of the line" in the parser, so we'll drop + -- those macros in lexing. + let start | name == "Ns" = [] + | otherwise = [m] let parsed = isParsedMacro name (wds, e) <- manyUntil (l parsed) eofline - return $ MdocTokens $ Seq.fromList $ (m:wds) <> [e] + return $ MdocTokens $ Seq.fromList $ start <> wds <> [e] where l True = try lexDelim <|> try lexCallableMacro <|> lexLit l False = try lexDelim <|> lexLit diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index 896b56ae7be3..c836869895f6 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -145,7 +145,7 @@ tests = [ , testGroup "Ns macro" [ "at the beginning of a macro line (mandoc delta)" =: T.unlines [".Op before", ".Ns Op after"] =?> - para "[before][after]" -- mandoc: warning + "[before] [after]" + para "[before] [after]" -- mandoc: warning , "after a block closing macro" =: T.unlines [".Oo before", ".Oc Ns Op after"] =?> para "[before][after]" From b8b7411d61112372d84bcef5a063a6ece03badc6 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Tue, 24 Sep 2024 14:35:49 -0700 Subject: [PATCH 70/81] Add Ap macro the edge case of "Ap (" tested in this mandoc regress isn't present in any actual OpenBSD base system manuals, where Ap is only ever followed by a letter. Furthermore, "Ap" is generally uncommon compared to "Ns '" (e.g. ".Xr mandoc 1 Ns 's"). I'm accepting a difference from mandoc here because correctly suppressing space after the "(" here would require more refactoring than I feel like doing at time of writing. --- src/Text/Pandoc/Readers/Mdoc.hs | 16 +++++++++++++++- test/Tests/Readers/Mdoc.hs | 16 ++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 10ef60f9f848..7851da051a67 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -307,6 +307,9 @@ lineEnclosure nm xform = do noSpace :: Inlines noSpace = B.rawInline "mdoc" "Ns" +apMacro :: Inlines +apMacro = B.rawInline "mdoc" "Ap" + data SpacifyState = SpacifyState { accum :: [Inlines], prev :: Inlines, ns :: Bool } instance Default SpacifyState where def = SpacifyState [] mempty False @@ -316,8 +319,9 @@ foldNoSpaces xs = (finalize . foldl go def) xs go :: SpacifyState -> Inlines -> SpacifyState go s x | ns s && x == noSpace = s - | ns s = s{prev = prev s <> x, ns = False} + | x == apMacro = s{prev = prev s <> "'", ns = True} | x == noSpace = s{ns = True} + | ns s = s{prev = prev s <> x, ns = False} | null (prev s) = s{prev = x} | otherwise = s{accum = accum s <> [prev s], prev = x} finalize s @@ -546,6 +550,15 @@ parseLk = do parseNs :: PandocMonad m => MdocParser m Inlines parseNs = macro "Ns" >> return noSpace +parseAp :: PandocMonad m => MdocParser m Inlines +parseAp = macro "Ap" >> return apMacro + +-- parseAp :: PandocMonad m => MdocParser m Inlines +-- parseAp = do +-- macro "Ap" +-- return $ B.singleton apMacro + + -- TODO should possibly rename this function b/c some of these are -- Mdoc block partial-implicit macros. Unclear if this distinction -- is going to be relevant. @@ -585,6 +598,7 @@ parseInlineMacro = parseBro, parseAo, parseOo, + parseAp, parseNs ] diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index c836869895f6..65f7ae88b4cd 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -174,6 +174,22 @@ tests = [ ".No no \"Ns\" ns \")\" No no" =?> para ("nons)" <> space <> "no") ] + , testGroup "Ap macro" + [ "in the middle of a macro line" =: + ".Xr mandoc 1 Ap s" =?> + para (spanWith (cls "Xr") "mandoc(1)" <> "'s") + -- mandoc difference: the edge case of "Ap (" tested in this mandoc regress + -- isn't present in any actual OpenBSD base system manuals, where Ap is + -- only ever followed by a letter. Furthermore, "Ap" is generally uncommon + -- compared to "Ns '" (e.g. ".Xr mandoc 1 Ns 's"). I'm accepting a + -- difference from mandoc here because correctly suppressing space after + -- the "(" here would require more refactoring than I feel like doing at + -- time of writing. + -- per mandoc, should be: para (strong "bold" <> "'(" <> strong "bold") + , "with punctuation and called macro" =: + ".Sy bold Ap ( \"Sy\" bold" =?> + para (strong "bold" <> "'( " <> strong "bold") + ] , testGroup "inline punctuation" [ testGroup "leading punctuation" [ "open paren" =: ".Em ( b" =?> para ("(" <> emph "b") From f7d07bebe948e19f2a8e693fe5aa0fd478c86696 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Tue, 24 Sep 2024 15:29:37 -0700 Subject: [PATCH 71/81] Add a couple more easy codelike macros --- src/Text/Pandoc/Readers/Mdoc.hs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 7851da051a67..b4985090cdfb 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -426,6 +426,12 @@ parseCm = codeLikeInline "Cm" parseIc :: PandocMonad m => MdocParser m Inlines parseIc = codeLikeInline "Ic" +parseEr :: PandocMonad m => MdocParser m Inlines +parseEr = codeLikeInline "Er" + +parseCd :: PandocMonad m => MdocParser m Inlines +parseCd = codeLikeInline "Cd" + parseQl :: PandocMonad m => MdocParser m Inlines parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify @@ -574,6 +580,8 @@ parseInlineMacro = parseFl, parseCm, parseIc, + parseEr, + parseCd, parseAd, parseMs, parseAr, From 3464d9cba65790084b902aad89b73ce66969a3b9 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Wed, 25 Sep 2024 14:30:34 -0700 Subject: [PATCH 72/81] tktktk: Sm support --- src/Text/Pandoc/Readers/Mdoc.hs | 123 ++++++++++++++++++++++---------- test/Tests/Readers/Mdoc.hs | 17 +++++ 2 files changed, 102 insertions(+), 38 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index b4985090cdfb..181c4b850c8c 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -1,5 +1,6 @@ {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE LambdaCase #-} {- | Module : Text.Pandoc.Readers.Mdoc Copyright : @@ -43,21 +44,28 @@ data MdocSection | ShOther deriving (Show, Eq) -data ManState = ManState { readerOptions :: ReaderOptions - , metadata :: Meta - , tableCellsPlain :: Bool - , progName :: Maybe T.Text - , currentSection :: MdocSection - } deriving Show - -instance Default ManState where - def = ManState { readerOptions = def - , metadata = B.nullMeta - , tableCellsPlain = True - , currentSection = ShOther - , progName = Nothing } - -type MdocParser m = P.ParsecT [MdocToken] ManState m +data MdocState = MdocState + { readerOptions :: ReaderOptions + , metadata :: Meta + , tableCellsPlain :: Bool + , spacingMode :: Bool + , progName :: Maybe T.Text + , currentSection :: MdocSection + } + deriving (Show) + +instance Default MdocState where + def = + MdocState + { readerOptions = def + , metadata = B.nullMeta + , tableCellsPlain = True + , spacingMode = True + , currentSection = ShOther + , progName = Nothing + } + +type MdocParser m = P.ParsecT [MdocToken] MdocState m -- | Read mdoc from an input string and return a Pandoc document. @@ -68,15 +76,15 @@ readMdoc :: (PandocMonad m, ToSources a) readMdoc opts s = do let Sources inps = toSources s tokenz <- mconcat <$> mapM (uncurry lexMdoc) inps - let state = def {readerOptions = opts} :: ManState + let state = def {readerOptions = opts} :: MdocState eitherdoc <- readWithMTokens parseMdoc state (Foldable.toList . unRoffTokens $ tokenz) either (throwError . fromParsecError (Sources inps)) return eitherdoc readWithMTokens :: PandocMonad m - => ParsecT [MdocToken] ManState m a -- ^ parser - -> ManState -- ^ initial state + => ParsecT [MdocToken] MdocState m a -- ^ parser + -> MdocState -- ^ initial state -> [MdocToken] -- ^ input -> m (Either ParseError a) readWithMTokens parser state input = @@ -172,7 +180,7 @@ argsToInlines :: PandocMonad m => MdocParser m Inlines argsToInlines = do ls <- manyTill arg eol let strs = map (B.str . toString) ls - return $ spacify strs + spacify strs parsePrologue :: PandocMonad m => MdocParser m () parsePrologue = do @@ -228,6 +236,15 @@ parseStr = do (Str txt _) <- str return $ B.str txt +-- Multiple consecutive strs within a block always need to get spaces between +-- them and then packed up together, because text lines are never affected by +-- the spacing mode. XXX but apparently this isn't actually true for mandoc, +-- so I'm not sure what's correct here yet. +parseStrs :: PandocMonad m => MdocParser m Inlines +parseStrs = do + txt <- many1 parseStr + return $ mconcat $ intersperse B.space txt + parseDelim :: PandocMonad m => DelimSide -> MdocParser m Inlines parseDelim pos = do (Delim _ txt _) <- delim pos @@ -242,14 +259,14 @@ litsToInlines :: PandocMonad m => MdocParser m Inlines litsToInlines = do ls <- many1 lit let strs = map (B.str . toString) ls - return $ spacify strs + spacify strs litsAndDelimsToInlines :: PandocMonad m => MdocParser m Inlines litsAndDelimsToInlines = do (o, ls, c) <- delimitedArgs $ many lit guard $ not (null o && null ls && null c) - let strs = map (B.str . toString) ls - return $ o <> spacify strs <> c + strs <- spacify $ map (B.str . toString) ls + return $ o <> strs <> c openingDelimiters :: PandocMonad m => MdocParser m Inlines openingDelimiters = do @@ -260,7 +277,7 @@ openingDelimiters = do return $ openDelim <> omid pipes :: PandocMonad m => MdocParser m Inlines -pipes = spacify <$> many (parseDelim Middle) +pipes = many (parseDelim Middle) >>= spacify closingDelimiters :: PandocMonad m => MdocParser m Inlines closingDelimiters = do @@ -282,7 +299,7 @@ simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m simpleInline nm xform = do macro nm segs <- manyTill segment inlineContextEnd - return $ spacify segs + spacify segs where segment = do (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToInlines @@ -302,7 +319,8 @@ lineEnclosure nm xform = do (manyTill (parseInlineMacro <|> (try (litsAndDelimsToInlines <* notFollowedBy eol)) <|> litsToInlines) (lookAhead (many (macro "Ns" <|> delim Close) *> eol))) - return $ first <> xform (spacify further) <> finally + further' <- spacify further + return $ first <> xform further' <> finally noSpace :: Inlines noSpace = B.rawInline "mdoc" "Ns" @@ -310,8 +328,14 @@ noSpace = B.rawInline "mdoc" "Ns" apMacro :: Inlines apMacro = B.rawInline "mdoc" "Ap" -data SpacifyState = SpacifyState { accum :: [Inlines], prev :: Inlines, ns :: Bool } -instance Default SpacifyState where def = SpacifyState [] mempty False +smOff :: Inlines +smOff = B.rawInline "mdoc" "Sm off" + +smOn :: Inlines +smOn = B.rawInline "mdoc" "Sm on" + +data SpacifyState = SpacifyState { accum :: [Inlines], prev :: Inlines, ns :: Bool, sm :: Bool} +instance Default SpacifyState where def = SpacifyState [] mempty False True foldNoSpaces :: [Inlines] -> [Inlines] foldNoSpaces xs = (finalize . foldl go def) xs @@ -321,15 +345,23 @@ foldNoSpaces xs = (finalize . foldl go def) xs | ns s && x == noSpace = s | x == apMacro = s{prev = prev s <> "'", ns = True} | x == noSpace = s{ns = True} + | x == smOn = s{sm = True} + | sm s && x == smOff = s{accum = accum s <> [prev s], prev = mempty, sm = False} | ns s = s{prev = prev s <> x, ns = False} + | not (sm s) = s{prev = prev s <> x} | null (prev s) = s{prev = x} | otherwise = s{accum = accum s <> [prev s], prev = x} finalize s | null (prev s) = accum s | otherwise = accum s <> [prev s] -spacify :: [Inlines] -> Inlines -spacify = mconcat . intersperse B.space . foldNoSpaces +spacify :: PandocMonad m => [Inlines] -> MdocParser m Inlines +spacify x = do + mode <- spacingMode <$> getState + return (go mode x) + where + go True = mconcat . intersperse B.space . foldNoSpaces + go False = mconcat . foldNoSpaces {- Compatibility note: mandoc permits, and doesn't warn on, "vertical" macros (Pp, Bl/El, Bd/Ed) inside of "horizontal" block partial-explicit quotations @@ -345,7 +377,8 @@ multilineEnclosure op cl xform = do (macro cl show cl) closeDelim <- mconcat <$> many (parseDelim Close) optional eol - return $ openDelim <> xform (spacify contents) <> closeDelim + contents' <- spacify contents + return $ openDelim <> xform contents' <> closeDelim eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines eliminateEmpty x y = if null y then mempty else x y @@ -391,7 +424,7 @@ parseFl = do macro "Fl" start <- option mempty (emptyWithDelim <|> flfl <|> emptyWithMacro <|> emptyEmpty) segs <- manyTill segment inlineContextEnd - return $ spacify ([start] <> segs) + spacify ([start] <> segs) where emptyWithDelim = do lookAhead $ many1 (delim Middle <|> delim Close) @@ -409,7 +442,8 @@ parseFl = do emptyEmpty = lookAhead eol $> fl "-" segment = do (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText - return $ openDelim <> (spacify . (map fl) . flags) inlines <> closeDelim + inner <- (spacify . (map fl) . flags) inlines + return $ openDelim <> inner <> closeDelim fl = B.codeWith (cls "Fl") flags [] = ["-"] flags xs = map ("-" <>) xs @@ -532,7 +566,7 @@ parseLk = do openClose <- closingDelimiters openOpen <- openingDelimiters url <- toString <$> lit - inner <- spacify <$> many segment + inner <- many segment >>= spacify close <- closingDelimiters let label | null inner = B.str url | otherwise = inner @@ -611,13 +645,12 @@ parseInlineMacro = ] parseInline :: PandocMonad m => MdocParser m Inlines -parseInline = parseStr <|> - ((parseInlineMacro <|> litsAndDelimsToInlines) <* optional eol) - +parseInline = parseStrs <|> (controlLine >>= spacify) + where + controlLine = many1 ((parseInlineMacro <|> litsAndDelimsToInlines) <* optional eol) --- TODO probably need some kind of fold to deal with Ns parseInlines :: PandocMonad m => MdocParser m Inlines -parseInlines = spacify <$> many1 parseInline +parseInlines = many1 (parseSmToggle <|> parseInline) >>= spacify parsePara :: PandocMonad m => MdocParser m Blocks parsePara = do @@ -643,6 +676,20 @@ parseCodeBlock = do skipBlanks :: PandocMonad m => MdocParser m Blocks skipBlanks = many1 blank *> mempty +parseSmToggle :: PandocMonad m => MdocParser m Inlines +parseSmToggle = do + macro "Sm" + cur <- spacingMode <$> getState + mode <- optionMaybe (literal "on" $> True <|> literal "off" $> False) + eol + let newMode = update mode cur + modifyState $ \s -> s{spacingMode = newMode} + return $ if newMode then smOn else smOff + where + update = \case + Nothing -> not + Just x -> const x + parseBlock :: PandocMonad m => MdocParser m Blocks parseBlock = choice [ -- parseList -- , parseDefinitionList diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index 65f7ae88b4cd..b3de25a09ee8 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -174,6 +174,23 @@ tests = [ ".No no \"Ns\" ns \")\" No no" =?> para ("nons)" <> space <> "no") ] + , testGroup "spacing mode" + [ "all text" =: + T.unlines ["a", ".Sm off", "b c", "d", ".Sm on", "e"] =?> + para ("a " <> str "b c" <> " d e") + , "text around macro" =: + T.unlines ["a", ".Sm off", ".Sy b c", ".Sm on", "d"] =?> + para ("a" <> space <> strong "bc" <> space <> "d") + , "mulitple macros" =: + T.unlines ["a", ".Sm off", ".Sy b Em c", ".Sm on", "d"] =?> + para ("a" <> space <> strong "b" <> emph "c" <> space <> "d") + , "mulitple control lines" =: + T.unlines ["a", ".Sm off", ".Sy b", ".Em c", ".Sm on", "d"] =?> + para ("a" <> space <> strong "b" <> emph "c" <> space <> "d") + , "mixed control and text lines" =: + T.unlines ["a", ".Sm off", ".Sy b", "c", ".Em d", ".Sm on", "d"] =?> + para ("a" <> space <> strong "b" <> "c" <> emph "d" <> space <> "d") + ] , testGroup "Ap macro" [ "in the middle of a macro line" =: ".Xr mandoc 1 Ap s" =?> From 3b74c82e52abb1cac525716f60d71d9c230c68f8 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 26 Sep 2024 14:36:56 -0700 Subject: [PATCH 73/81] Abandon whitespace-preservation in text lines It ends up with bad results in the ANSI writer, for example, because it then can't break lines at Spaces. This isn't wholly inconsistent with mandoc, because it makes no effort to render multiple consecutive spaces from the source document in HTML. --- src/Text/Pandoc/Readers/Mdoc.hs | 5 +---- test/Tests/Readers/Mdoc.hs | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 181c4b850c8c..b60f510434f2 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -228,13 +228,10 @@ parseSynopsisSection = do guard $ sec == ShSynopsis return mempty --- parseStr doesn't use B.text because roff(7) specifies that --- whitespace in text lines is treated literally. --- XXX is this what we actually want? parseStr :: PandocMonad m => MdocParser m Inlines parseStr = do (Str txt _) <- str - return $ B.str txt + return $ B.text txt -- Multiple consecutive strs within a block always need to get spaces between -- them and then packed up together, because text lines are never affected by diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs index b3de25a09ee8..77fc529c90f0 100644 --- a/test/Tests/Readers/Mdoc.hs +++ b/test/Tests/Readers/Mdoc.hs @@ -177,7 +177,7 @@ tests = [ , testGroup "spacing mode" [ "all text" =: T.unlines ["a", ".Sm off", "b c", "d", ".Sm on", "e"] =?> - para ("a " <> str "b c" <> " d e") + para ("a b c d e") , "text around macro" =: T.unlines ["a", ".Sm off", ".Sy b c", ".Sm on", "d"] =?> para ("a" <> space <> strong "bc" <> space <> "d") From 681e1d3f5a4538fff21756e120e4bbb32cba7638 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 26 Sep 2024 15:26:40 -0700 Subject: [PATCH 74/81] Skip to the end of the SYNOPSIS for now --- src/Text/Pandoc/Readers/Mdoc.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index b60f510434f2..24774eec9f60 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -226,6 +226,8 @@ parseSynopsisSection :: PandocMonad m => MdocParser m Blocks parseSynopsisSection = do sec <- currentSection <$> getState guard $ sec == ShSynopsis + -- TODO actually implement this + manyTill (anyToken) (lookAhead (macro "Sh")) return mempty parseStr :: PandocMonad m => MdocParser m Inlines From 9d956b8c3122debfb3d641fa66af1ba3f53ccded Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 26 Sep 2024 15:27:45 -0700 Subject: [PATCH 75/81] Reformat --- src/Text/Pandoc/Readers/Mdoc.hs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 24774eec9f60..34a1d377982e 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -333,8 +333,15 @@ smOff = B.rawInline "mdoc" "Sm off" smOn :: Inlines smOn = B.rawInline "mdoc" "Sm on" -data SpacifyState = SpacifyState { accum :: [Inlines], prev :: Inlines, ns :: Bool, sm :: Bool} -instance Default SpacifyState where def = SpacifyState [] mempty False True +data SpacifyState = SpacifyState + { accum :: [Inlines], + prev :: Inlines, + ns :: Bool, + sm :: Bool + } + +instance Default SpacifyState where + def = SpacifyState [] mempty False True foldNoSpaces :: [Inlines] -> [Inlines] foldNoSpaces xs = (finalize . foldl go def) xs From 7e0f712b3dfe7f26ae991d3406d45660e49ceb6d Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 26 Sep 2024 15:28:19 -0700 Subject: [PATCH 76/81] Fix .Nm parsing and insertion of manual name --- src/Text/Pandoc/Readers/Mdoc.hs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 34a1d377982e..dc166247329b 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -528,17 +528,17 @@ parseAo = multilineEnclosure "Ao" "Ac" $ \x -> "⟨" <> x <> "⟩" parseNm :: PandocMonad m => MdocParser m Inlines parseNm = do + macro "Nm" mnm <- (progName <$> getState) - case mnm of - Nothing -> do - (_, nm, _) <- lookAhead $ delimitedArgs $ option mempty litsToInlines - guard $ not (null nm) - simpleInline "Nm" ok - Just nm -> simpleInline "Nm" $ \x -> - if null x - then B.codeWith (cls "Nm") nm - else ok x - where ok = B.codeWith (cls "Nm") . stringify + (op, rg, cl) <- delimitedArgs $ option mempty litsToInlines + return $ case (mnm, rg) of + (Just nm, x) | null x -> + op <> ok nm <> cl + (_, x) -> + op <> (ok . stringify) x <> cl + where + ok = B.codeWith (cls "Nm") + parseXr :: PandocMonad m => MdocParser m Inlines parseXr = do From 638d84a7453d91bdd791134ea76486827b72a067 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 26 Sep 2024 15:32:28 -0700 Subject: [PATCH 77/81] Skip unknown macros in block context Getting to the point where I can start working with real manual pages so this is helpful. --- src/Text/Pandoc/Readers/Mdoc.hs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index dc166247329b..63aad7df69f7 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -696,6 +696,9 @@ parseSmToggle = do Nothing -> not Just x -> const x +skipUnknownMacro :: PandocMonad m => MdocParser m Blocks +skipUnknownMacro = anyMacro *> manyTill anyToken eol $> mempty + parseBlock :: PandocMonad m => MdocParser m Blocks parseBlock = choice [ -- parseList -- , parseDefinitionList @@ -708,6 +711,6 @@ parseBlock = choice [ -- parseList , skipBlanks -- , parseBlockQuote -- , parseNewParagraph - -- , skipUnknownMacro + , skipUnknownMacro ] From 6ed129f91693a9018e9b115387ffb871b817b327 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 26 Sep 2024 16:47:32 -0700 Subject: [PATCH 78/81] Parse display blocks and 1-line literals A bit janky but worse things have happened. --- src/Text/Pandoc/Readers/Mdoc.hs | 69 ++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 63aad7df69f7..6fe6224f2980 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -526,6 +526,12 @@ parseAq = lineEnclosure "Aq" $ \x -> "⟨" <> x <> "⟩" parseAo :: PandocMonad m => MdocParser m Inlines parseAo = multilineEnclosure "Ao" "Ac" $ \x -> "⟨" <> x <> "⟩" +parseDl :: PandocMonad m => MdocParser m Blocks +parseDl = do + inner <- lineEnclosure "Dl" id + eol + return $ B.codeBlock (stringify inner) + parseNm :: PandocMonad m => MdocParser m Inlines parseNm = do macro "Nm" @@ -648,20 +654,45 @@ parseInlineMacro = parseOo, parseAp, parseNs - ] + ] "inline macro" parseInline :: PandocMonad m => MdocParser m Inlines -parseInline = parseStrs <|> (controlLine >>= spacify) +parseInline = parseStrs <|> (controlLine >>= spacify) "text lines or inline macros" where controlLine = many1 ((parseInlineMacro <|> litsAndDelimsToInlines) <* optional eol) parseInlines :: PandocMonad m => MdocParser m Inlines parseInlines = many1 (parseSmToggle <|> parseInline) >>= spacify +-- Lp is a deprecated synonym for Pp parsePara :: PandocMonad m => MdocParser m Blocks -parsePara = do - optional (emptyMacro "Pp" <|> emptyMacro "Lp") -- Lp: deprecated synonym for Pp - B.para . B.trimInlines <$> parseInlines +parsePara = B.para . B.trimInlines <$> parseInlines <* + optional (emptyMacro "Pp" <|> emptyMacro "Lp") + +-- Indented display blocks are visually similar to block quotes +-- but rarely carry those semantics. I'm just putting things in +-- divs. Centered is discouraged and rarely seen. +parseDisplay :: PandocMonad m => MdocParser m Blocks +parseDisplay = do + literal "-filled" <|> literal "-ragged" <|> literal "-centered" + optional (literal "-offset" *> lit) + optional (literal "-compact") + eol + B.divWith (cls "display") . mconcat <$> many parseRegularBlock + +-- This is something of a best-effort interpretation of the -unfilled +-- display block type. The main difference with mandoc is probably +-- that newlines inside of multiline enclosures won't be preserved. +parseUnfilled :: PandocMonad m => MdocParser m Blocks +parseUnfilled = do + literal "-unfilled" + optional (literal "-offset" *> lit) + optional (literal "-compact") + eol + lns <- many (parseStr <|> (blank $> mempty) <|> (controlLine >>= spacify)) + return $ B.lineBlock lns + where + controlLine = many1 ((parseInlineMacro <|> litsAndDelimsToInlines)) <* optional eol -- CodeBlocks can't contain any other markup, but mdoc -- still interprets control lines within .Bd -literal @@ -669,15 +700,20 @@ parsePara = do -- we get any control lines inside a Bd literal parseCodeBlock :: PandocMonad m => MdocParser m Blocks parseCodeBlock = do - macro "Bd" -- TODO will need to hoist literal "-literal" optional (literal "-offset" *> lit) optional (literal "-compact") eol l <- T.unlines . map toString <$> many (str <|> blank) - emptyMacro "Ed" return $ B.codeBlock l +parseBd :: PandocMonad m => MdocParser m Blocks +parseBd = do + macro "Bd" + blk <- parseCodeBlock <|> parseDisplay <|> parseUnfilled + emptyMacro "Ed" + return blk + skipBlanks :: PandocMonad m => MdocParser m Blocks skipBlanks = many1 blank *> mempty @@ -699,18 +735,23 @@ parseSmToggle = do skipUnknownMacro :: PandocMonad m => MdocParser m Blocks skipUnknownMacro = anyMacro *> manyTill anyToken eol $> mempty +parseRegularBlock :: PandocMonad m => MdocParser m Blocks +parseRegularBlock = + choice + [ parseDl + , parsePara + , emptyMacro "Pp" *> mempty + , parseBd + , skipBlanks + , skipUnknownMacro + ] + parseBlock :: PandocMonad m => MdocParser m Blocks parseBlock = choice [ -- parseList -- , parseDefinitionList parseHeader , parseNameSection , parseSynopsisSection - , parsePara - -- , parseTable - , parseCodeBlock - , skipBlanks - -- , parseBlockQuote - -- , parseNewParagraph - , skipUnknownMacro + , parseRegularBlock ] From 2d94ad6b2dfbbbfab760d298cb471ccf642d04a3 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 26 Sep 2024 21:25:09 -0700 Subject: [PATCH 79/81] Move skipUnknownMacro to outer blocks Was having probalos --- src/Text/Pandoc/Readers/Mdoc.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 6fe6224f2980..7efa4f0f440a 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -743,7 +743,6 @@ parseRegularBlock = , emptyMacro "Pp" *> mempty , parseBd , skipBlanks - , skipUnknownMacro ] parseBlock :: PandocMonad m => MdocParser m Blocks @@ -753,5 +752,6 @@ parseBlock = choice [ -- parseList , parseNameSection , parseSynopsisSection , parseRegularBlock + , skipUnknownMacro ] From 2589dbbb323b8c5e6c743ca9ca1e14db54c25c04 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 26 Sep 2024 21:25:47 -0700 Subject: [PATCH 80/81] Add Dv macro --- src/Text/Pandoc/Readers/Mdoc.hs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index 7efa4f0f440a..c10e5e3f51c0 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -406,6 +406,9 @@ parseNo = simpleInline "No" (eliminateEmpty id) parseEv :: PandocMonad m => MdocParser m Inlines parseEv = codeLikeInline "Ev" +parseDv :: PandocMonad m => MdocParser m Inlines +parseDv = codeLikeInline "Dv" + parseAd :: PandocMonad m => MdocParser m Inlines parseAd = spanLikeInline "Ad" @@ -621,6 +624,7 @@ parseInlineMacro = parseEm, parseLk, parseEv, + parseDv, parseMt, parsePa, parseFl, From 2147b2f6528b5df130d9a27d15cbe9334b4d3a49 Mon Sep 17 00:00:00 2001 From: Evan Silberman Date: Thu, 26 Sep 2024 21:27:20 -0700 Subject: [PATCH 81/81] Add bullet, ordered, and definition lists --- src/Text/Pandoc/Readers/Mdoc.hs | 40 +++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs index c10e5e3f51c0..6643ea588116 100644 --- a/src/Text/Pandoc/Readers/Mdoc.hs +++ b/src/Text/Pandoc/Readers/Mdoc.hs @@ -718,6 +718,45 @@ parseBd = do emptyMacro "Ed" return blk +parseItemList :: PandocMonad m => MdocParser m Blocks +parseItemList = do + f <- (choice (map literal ["-bullet", "-dash", "-hyphen", "-item"]) $> B.bulletList) + <|> literal "-enum" $> B.orderedList + optional (literal "-width" *> lit) + optional (literal "-offset" *> lit) + optional (literal "-compact") + eol + items <- many bulletItem + return $ f items + where + bulletItem = do + emptyMacro "It" + mconcat <$> many parseRegularBlock + +parseDefinitionList :: PandocMonad m => MdocParser m Blocks +parseDefinitionList = do + choice $ map literal ["-hang", "-inset", "-ohang", "-tag"] + optional (literal "-width" *> lit) + optional (literal "-offset" *> lit) + optional (literal "-compact") + eol + items <- many dlItem + return $ B.definitionList items + where + dlItem = do + macro "It" + dt <- listHead >>= spacify + dd <- mconcat <$> many parseRegularBlock + return (dt, [dd]) + -- TODO support Xo/Xc + listHead = many1 ((parseInlineMacro <|> litsAndDelimsToInlines)) <* optional eol + +parseBl :: PandocMonad m => MdocParser m Blocks +parseBl = do + macro "Bl" + blk <- parseItemList <|> parseDefinitionList + emptyMacro "El" + return blk skipBlanks :: PandocMonad m => MdocParser m Blocks skipBlanks = many1 blank *> mempty @@ -746,6 +785,7 @@ parseRegularBlock = , parsePara , emptyMacro "Pp" *> mempty , parseBd + , parseBl , skipBlanks ]