From 68ec7887adad160d75bb9c41e6d74e331900f2e9 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Mon, 9 Sep 2024 16:12:49 -0700
Subject: [PATCH 01/81] Start Mdoc lexer

---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 331 ++++++++++++++++++++++++++++
 1 file changed, 331 insertions(+)
 create mode 100644 src/Text/Pandoc/Readers/Mdoc/Lex.hs

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
new file mode 100644
index 000000000000..ad380a9b973c
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -0,0 +1,331 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ViewPatterns #-}
+{- |
+   Module      : Text.Pandoc.Readers.Mdoc.Lex
+   Copyright   : Copyright (C) 2018-2020 Yan Pashkovsky and John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Yan Pashkovsky <yanp.bugz@gmail.com>
+   Stability   : WIP
+   Portability : portable
+
+Tokenizer for roff formats (man, ms).
+-}
+module Text.Pandoc.Readers.Mdoc.Lex
+  ( MdocToken(..)
+  , MdocTokens(..)
+  , lexMdoc
+  )
+where
+
+import Safe (lastDef)
+import Control.Monad (void, mzero, mplus, guard)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class.PandocMonad
+       (getResourcePath, readFileFromDirs, PandocMonad(..), report)
+import Data.Char (isLower, toLower, toUpper, chr, isAscii, isAlphaNum)
+import Data.Default (Default)
+import qualified Data.Map as M
+import Data.List (intercalate)
+import qualified Data.Text as T
+import Text.Pandoc.Logging (LogMessage(..))
+import Text.Pandoc.Options
+import Text.Pandoc.Parsing
+import Text.Pandoc.Shared (safeRead)
+import Text.Pandoc.RoffChar (characterCodes, combiningAccents)
+import qualified Data.Sequence as Seq
+import qualified Data.Foldable as Foldable
+import qualified Data.Text.Normalize as Normalize
+
+-- import Debug.Trace (traceShowId)
+
+--
+-- Data Types
+--
+type TableOption = (T.Text, T.Text)
+
+
+data CellFormat =
+  CellFormat
+  { columnType     :: Char
+  , pipePrefix     :: Bool
+  , pipeSuffix     :: Bool
+  , columnSuffixes :: [T.Text]
+  } deriving (Show, Eq, Ord)
+
+type TableRow = ([CellFormat], [MdocTokens])
+
+data MdocToken = Str T.Text SourcePos
+               | Macro T.Text SourcePos
+               | Lit T.Text SourcePos
+               | Tbl [TableOption] [TableRow] SourcePos
+               deriving Show
+
+newtype MdocTokens = MdocTokens { unRoffTokens :: Seq.Seq MdocToken }
+        deriving (Show, Semigroup, Monoid)
+
+singleTok :: MdocToken -> MdocTokens
+singleTok t = MdocTokens (Seq.singleton t)
+
+data RoffState = RoffState { tableTabChar :: Char
+                           } deriving Show
+
+instance Default RoffState where
+  def = RoffState { tableTabChar = '\t'
+                  }
+
+type RoffLexer m = ParsecT Sources RoffState m
+
+--
+-- Lexer: T.Text -> RoffToken
+--
+
+eofline :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m ()
+eofline = void newline <|> eof
+
+spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m Char
+spacetab = char ' ' <|> char '\t'
+
+characterCodeMap :: M.Map T.Text Char
+characterCodeMap =
+  M.fromList $ map (\(x,y) -> (y,x)) characterCodes
+
+combiningAccentsMap :: M.Map T.Text Char
+combiningAccentsMap =
+  M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
+
+escape :: PandocMonad m => RoffLexer m T.Text
+escape = try $ do
+  backslash
+  escapeGlyph <|> escapeNormal
+
+escapeGlyph :: PandocMonad m => RoffLexer m T.Text
+escapeGlyph = do
+  c <- lookAhead (oneOf ['[','('])
+  escapeArg >>= resolveGlyph c
+
+resolveGlyph :: PandocMonad m => Char -> T.Text -> RoffLexer m T.Text
+resolveGlyph delimChar glyph = do
+  let cs = T.replace "_u" " u" glyph -- unicode glyphs separated by _
+  (case T.words cs of
+      []  -> mzero
+      [s] -> case M.lookup s characterCodeMap `mplus` readUnicodeChar s of
+               Nothing -> mzero
+               Just c  -> return $ T.singleton c
+      (s:ss) -> do
+        basechar <- case M.lookup s characterCodeMap `mplus`
+                         readUnicodeChar s of
+                      Nothing ->
+                        case T.unpack s of
+                          [ch] | isAscii ch && isAlphaNum ch ->
+                                 return ch
+                          _ -> mzero
+                      Just c  -> return c
+        let addAccents [] xs = return $ Normalize.normalize Normalize.NFC $
+                                        T.reverse xs
+            addAccents (a:as) xs =
+              case M.lookup a combiningAccentsMap `mplus` readUnicodeChar a of
+                Just x  -> addAccents as $ T.cons x xs
+                Nothing -> mzero
+        addAccents ss (T.singleton basechar) >>= \xs -> return xs)
+      <|> case delimChar of
+            '['  -> escUnknown ("\\[" <> glyph <> "]")
+            '('  -> escUnknown ("\\(" <> glyph)
+            '\'' -> escUnknown ("\\C'" <> glyph <> "'")
+            _    -> Prelude.fail "resolveGlyph: unknown glyph delimiter"
+
+readUnicodeChar :: T.Text -> Maybe Char
+readUnicodeChar t = case T.uncons t of
+  Just ('u', cs) | T.length cs > 3 -> chr <$> safeRead ("0x" <> cs)
+  _ -> Nothing
+
+escapeNormal :: PandocMonad m => RoffLexer m T.Text
+escapeNormal = do
+  c <- noneOf "{}"
+  let groffSkip = [escapeArg, countChar 1 (satisfy (/='\n'))]
+  case c of
+    ' ' -> return " " -- mandoc_char(7) says this should be a nonbreaking space
+    '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
+    '#' -> mempty <$ manyTill anyChar newline
+    '%' -> return mempty  -- optional hyphenation
+    '&' -> return mempty  -- nonprintable zero-width
+    ')' -> return mempty  -- nonprintable zero-width
+    '*' -> escIgnore '*' groffSkip
+    ',' -> return mempty  -- to fix spacing after roman
+    '-' -> return "-"
+    '.' -> return "."
+    '/' -> return mempty  -- to fix spacing before roman
+    '0' -> return "\x2007" -- digit-width space
+    ':' -> return mempty  -- zero-width break
+    'A' -> escIgnore 'A' [quoteArg]
+    'B' -> escIgnore 'B' [quoteArg]
+    'C' -> quoteArg >>= resolveGlyph '\''
+    'D' -> escIgnore 'D' [quoteArg]
+    'F' -> escIgnore 'F' groffSkip
+    'H' -> escIgnore 'H' [quoteArg]
+    'L' -> escIgnore 'L' [quoteArg]
+    'M' -> escIgnore 'M' groffSkip
+    'N' -> escIgnore 'N' [quoteArg]
+    'O' -> escIgnore 'O' groffSkip
+    'R' -> escIgnore 'R' [quoteArg]
+    'S' -> escIgnore 'S' [quoteArg]
+    'V' -> escIgnore 'V' groffSkip
+    'X' -> escIgnore 'X' [quoteArg]
+    'Y' -> escIgnore 'Y' groffSkip
+    'Z' -> escIgnore 'Z' [quoteArg]
+    '\'' -> return "'"
+    '\n' -> return mempty  -- line continuation
+    '^' -> return "\x200A" -- 1/12 em space
+    '_' -> return "_"
+    '`' -> return "`"
+    'a' -> return mempty  -- "non-interpreted leader character"
+    'b' -> escIgnore 'b' [quoteArg]
+    'c' -> return mempty  -- interrupt text processing
+    'd' -> escIgnore 'd' [] -- forward down 1/2em
+    'e' -> return "\\"
+    'f' -> escIgnore 'f' groffSkip
+    'g' -> escIgnore 'g' groffSkip
+    'h' -> escIgnore 'h' [quoteArg]
+    'k' -> escIgnore 'k' groffSkip
+    'l' -> escIgnore 'l' [quoteArg]
+    'm' -> escIgnore 'm' groffSkip
+    'n' -> escIgnore 'm' groffSkip
+    'o' -> escIgnore 'o' [quoteArg]
+    'p' -> escIgnore 'p' []
+    'r' -> escIgnore 'r' []
+    's' -> escIgnore 's' [escapeArg, signedNumber]
+    't' -> return "\t"
+    'u' -> escIgnore 'u' []
+    'v' -> escIgnore 'v' [quoteArg]
+    'w' -> escIgnore 'w' [quoteArg]
+    'x' -> escIgnore 'x' [quoteArg]
+    'z' -> escIgnore 'z' [countChar 1 anyChar]
+    '|' -> return "\x2006" --1/6 em space
+    '~' -> return "\160" -- nonbreaking space
+    '\\' -> return "\\"
+    _   -> return $ T.singleton c
+    -- man 7 groff: "If  a  backslash  is followed by a character that
+    -- does not constitute a defined escape sequence, the backslash
+    -- is  silently  ignored  and  the character maps to itself."
+
+escIgnore :: PandocMonad m
+          => Char
+          -> [RoffLexer m T.Text]
+          -> RoffLexer m T.Text
+escIgnore c argparsers = do
+  pos <- getPosition
+  arg <- snd <$> withRaw (choice argparsers) <|> return ""
+  report $ SkippedContent ("\\" <> T.cons c arg) pos
+  return mempty
+
+escUnknown :: PandocMonad m => T.Text -> RoffLexer m T.Text
+escUnknown s = do
+  pos <- getPosition
+  report $ SkippedContent s pos
+  return "\xFFFD"
+
+signedNumber :: PandocMonad m => RoffLexer m T.Text
+signedNumber = try $ do
+  sign <- option "" ("-" <$ char '-' <|> "" <$ char '+')
+  ds <- many1Char digit
+  return (sign <> ds)
+
+-- Parses: [..] or (..
+escapeArg :: PandocMonad m => RoffLexer m T.Text
+escapeArg = choice
+    [ char '[' *> manyTillChar (noneOf ['\n',']']) (char ']')
+    , char '(' *> countChar 2 (satisfy (/='\n'))
+    ]
+
+-- Parses: '..'
+quoteArg :: PandocMonad m => RoffLexer m T.Text
+quoteArg = char '\'' *> manyTillChar (noneOf ['\n','\'']) (char '\'')
+
+-- separate function from lexMacro since real man files sometimes do not
+-- follow the rules
+lexComment :: PandocMonad m => RoffLexer m MdocTokens
+lexComment = do
+  try $ string ".\\\""
+  skipMany $ noneOf "\n"
+  eofline
+  return mempty
+
+
+argText :: PandocMonad m => RoffLexer m T.Text
+argText = mconcat <$> many1 (escape <|> regularText)
+
+spaceTabChar :: PandocMonad m => RoffLexer m T.Text
+spaceTabChar = T.singleton <$> spacetab
+
+quotedArg :: PandocMonad m => RoffLexer m T.Text
+quotedArg = do
+  quoteChar
+  t <- mconcat <$> many (escape <|> regularText <|> innerQuote <|> spaceTabChar)
+  quoteChar
+  notFollowedBy quoteChar
+  return t
+  where
+    innerQuote = do
+      string "\"\""
+      return "\""
+
+anyText :: PandocMonad m => RoffLexer m T.Text
+anyText = escape <|> regularText <|> quoteChar <|> spaceTabChar
+
+backslash :: PandocMonad m => RoffLexer m ()
+backslash = 
+  (mempty <* char '\\') <|> (mempty <* string "\\E")
+
+regularText :: PandocMonad m => RoffLexer m T.Text
+regularText = many1Char $ noneOf "\n\r\t \\\""
+
+quoteChar :: PandocMonad m => RoffLexer m T.Text
+quoteChar = T.singleton <$> char '"'
+
+mdocToken :: PandocMonad m => RoffLexer m MdocTokens
+mdocToken = lexComment <|> lexControlLine <|> lexTextLine
+
+lexMacro :: PandocMonad m => RoffLexer m MdocToken
+lexMacro = do
+  pos <- getPosition
+  name <- many1Char (satisfy isMacroChar)
+  skipSpaces
+  return $ Macro name pos
+  where
+    isMacroChar '%' = True
+    isMacroChar x = isAlphaNum x
+
+lexLit :: PandocMonad m => RoffLexer m MdocToken
+lexLit = do
+  pos <- getPosition
+  t <- mconcat <$> many (argText <|> quotedArg)
+  return $ Lit t pos
+
+lexTextLine :: PandocMonad m => RoffLexer m MdocTokens
+lexTextLine = do
+  pos <- getPosition
+  guard $ sourceColumn pos == 1
+  notFollowedBy $ char '.'
+  t <- mconcat <$> many anyText
+  return $ singleTok $ Str t pos
+
+lexControlLine :: PandocMonad m => RoffLexer m MdocTokens
+lexControlLine = do
+  pos <- getPosition
+  guard $ sourceColumn pos == 1
+  char '.'
+  m <- lexMacro
+  wds <- sepBy (lexLit) spacetab
+  eofline
+  return $ MdocTokens $ Seq.singleton m <> Seq.fromList wds
+
+-- | Tokenize a string as a sequence of roff tokens.
+lexMdoc :: PandocMonad m => SourcePos -> T.Text -> m MdocTokens
+lexMdoc pos txt = do
+  eithertokens <- readWithM (do setPosition pos
+                                mconcat <$> many mdocToken) def txt
+  case eithertokens of
+    Left e       -> throwError e
+    Right tokenz -> return tokenz

From b4bf4f429a188e9051610fc4ccbdfeb1ca5b1197 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Tue, 10 Sep 2024 17:31:46 -0700
Subject: [PATCH 02/81] Start mdoc reader

---
 src/Text/Pandoc/Readers/Mdoc.hs | 232 ++++++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 src/Text/Pandoc/Readers/Mdoc.hs

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
new file mode 100644
index 000000000000..c19758c1f30e
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -0,0 +1,232 @@
+{-# LANGUAGE FlexibleContexts  #-}
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Mdoc
+   Copyright   : 
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : a
+   Stability   : WIP
+   Portability : portable
+
+Conversion of man to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Mdoc (readMdoc) where
+
+import Data.Default (Default)
+import Control.Monad (mplus, guard, void, when)
+import Control.Monad.Except (throwError)
+import Data.List (intersperse)
+import qualified Data.Text as T
+import Text.Pandoc.Definition (Pandoc(Pandoc), Meta)
+import Text.Pandoc.Builder (Blocks, Inlines)
+import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Class.PandocMonad (PandocMonad(..))
+import Text.Pandoc.Options
+import Text.Pandoc.Parsing
+import Text.Pandoc.Readers.Mdoc.Lex as Lex  -- TODO explicit imports
+import Text.Parsec (modifyState)
+import qualified Text.Pandoc.Parsing as P
+import qualified Data.Foldable as Foldable
+import Text.Pandoc.Shared (stringify)
+
+data MdocSection
+  = ShName
+  | ShSynopsis
+  | ShOther
+  deriving (Show, Eq)
+
+data ManState = ManState { readerOptions   :: ReaderOptions
+                         , metadata        :: Meta
+                         , tableCellsPlain :: Bool
+                         , progName :: Maybe T.Text
+                         , currentSection :: MdocSection
+                         } deriving Show
+
+instance Default ManState where
+  def = ManState { readerOptions   = def
+                 , metadata        = B.nullMeta
+                 , tableCellsPlain = True
+                 , currentSection = ShOther
+                 , progName = Nothing }
+
+type MdocParser m = P.ParsecT [MdocToken] ManState m
+
+
+-- | Read man (troff) from an input string and return a Pandoc document.
+readMdoc :: (PandocMonad m, ToSources a)
+        => ReaderOptions
+        -> a
+        -> m Pandoc
+readMdoc opts s = do
+  let Sources inps = toSources s
+  tokenz <- mconcat <$> mapM (uncurry lexMdoc) inps
+  let state = def {readerOptions = opts} :: ManState
+  eitherdoc <- readWithMTokens parseMdoc state
+     (Foldable.toList . unRoffTokens $ tokenz)
+  either (throwError . fromParsecError (Sources inps)) return eitherdoc
+
+
+readWithMTokens :: PandocMonad m
+        => ParsecT [MdocToken] ManState m a  -- ^ parser
+        -> ManState                         -- ^ initial state
+        -> [MdocToken]                       -- ^ input
+        -> m (Either ParseError a)
+readWithMTokens parser state input =
+  runParserT parser state "source" input
+
+
+parseMdoc :: PandocMonad m => MdocParser m Pandoc
+parseMdoc = do
+  optional parsePrologue
+  trace "hasdf"
+  bs <- many parseBlock <* eof
+  trace "DFDF"
+  meta <- metadata <$> getState
+  let (Pandoc _ blocks) = B.doc $ mconcat bs
+  return $ Pandoc meta blocks
+
+msatisfy :: Monad m
+         => (MdocToken -> Bool) -> P.ParsecT [MdocToken] st m MdocToken
+msatisfy predic = P.tokenPrim show nextPos testTok
+  where
+    testTok t     = if predic t then Just t else Nothing
+    nextPos _ _ (Macro _ pos':_) = pos'
+    nextPos _ _ (Lit _ pos':_) = pos'
+    nextPos _ _ (Str _ pos':_) = pos'
+    nextPos _ _ (Tbl _ _ pos':_) = pos'
+    nextPos a _ (Eol{}:x:xs) = nextPos a x xs
+    nextPos pos _ [Eol] = pos
+    nextPos pos _ [] = pos
+
+macro :: PandocMonad m => T.Text -> MdocParser m MdocToken
+macro name = msatisfy t where
+  t (Macro n _) = n == name
+  t _ = False
+
+emptyMacro :: PandocMonad m => T.Text -> MdocParser m MdocToken
+emptyMacro n = macro n <* eol
+
+str :: PandocMonad m => MdocParser m MdocToken
+str = msatisfy t where
+  t Str{} = True
+  t _ = False
+
+lit :: PandocMonad m => MdocParser m MdocToken
+lit = msatisfy t where
+  t Lit{} = True
+  t _ = False
+
+arg :: PandocMonad m => MdocParser m MdocToken
+arg = msatisfy t where
+  t Lit{} = True
+  t Macro{} = True
+  t _ = False
+
+literal :: PandocMonad m => T.Text -> MdocParser m MdocToken
+literal n = msatisfy t where
+  t (Lit n' _) = n == n'
+  t _ = False
+
+eol :: PandocMonad m => MdocParser m ()
+eol = void $ msatisfy t where
+  t Eol{} = True
+  t _ = False
+
+argsToInlines :: PandocMonad m => MdocParser m Inlines
+argsToInlines = do
+  ls <- manyTill arg eol
+  let strs = map (B.str . toString) ls
+  return $ mconcat $ intersperse B.space strs
+
+parsePrologue :: PandocMonad m => MdocParser m ()
+parsePrologue = do
+  macro "Dd"
+  date <- argsToInlines
+  macro "Dt"
+  (Lit title _) <- lit
+  (Lit section _) <- lit
+  eol
+  emptyMacro "Os"
+  let adjust = B.setMeta "title" (B.str title) . B.setMeta "date" date . B.setMeta "section" (B.str section)
+  modifyState $ \s -> s{metadata = adjust $ metadata s}
+
+shToSectionMode :: T.Text -> MdocSection
+shToSectionMode "NAME" = ShName
+shToSectionMode "SYNOPSIS" = ShSynopsis
+shToSectionMode _ = ShOther
+
+parseHeader :: PandocMonad m => MdocParser m Blocks
+parseHeader = do
+  (Macro m _) <- macro "Sh" <|> macro "Ss"
+  txt <- argsToInlines
+  let lvl = if m == "Sh" then 1 else 2
+  when (lvl == 1) $ modifyState $ \s -> s{currentSection = (shToSectionMode . stringify) txt}
+  return $ B.header lvl txt
+
+parseNameSection :: PandocMonad m => MdocParser m Blocks
+parseNameSection = do
+  sec <- currentSection <$> getState
+  guard $ sec == ShName
+  macro "Nm" <|> macro "Fn"
+  pname <- toString <$> arg
+  -- TODO multiple Nm macros and delimiters
+  eol
+  macro "Nd"
+  desc <- argsToInlines
+  modifyState $ \s -> s{progName = mplus (progName s) (Just pname)}
+  skipMany (msatisfy (const True))
+  return $ B.para $ B.code pname <> B.space <> "—" <> B.space <> desc
+
+parseSynopsisSection :: PandocMonad m => MdocParser m Blocks
+parseSynopsisSection = do
+  sec <- currentSection <$> getState
+  guard $ sec == ShSynopsis
+  return mempty
+
+parseStr :: PandocMonad m => MdocParser m Inlines
+parseStr = do
+  (Str txt _) <- str
+  return $ B.str txt
+
+parseLit :: PandocMonad m => MdocParser m Inlines
+parseLit = do
+  (Lit txt _) <- lit
+  return $ B.str txt
+
+simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
+simpleInline nm xform = do
+  macro nm
+  -- inlines <- mconcat <$> many1 parseLit
+  return $ xform $ B.text "go nuts"
+
+-- Sy: callable, parsed, >0 arguments
+parseSy :: PandocMonad m => MdocParser m Inlines
+parseSy = trace "SSS" >> simpleInline "Sy" B.strong
+
+parseInlineMacro :: PandocMonad m => MdocParser m Inlines
+parseInlineMacro = choice [ parseSy ]
+
+parseInline :: PandocMonad m => MdocParser m Inlines
+parseInline = parseStr <|> (parseInlineMacro <* eol)
+
+parseInlines :: PandocMonad m => MdocParser m Inlines
+parseInlines = mconcat <$> many1 parseInline
+
+parsePara :: PandocMonad m => MdocParser m Blocks
+parsePara = B.para . B.trimInlines <$> parseInlines
+
+parseBlock :: PandocMonad m => MdocParser m Blocks
+parseBlock = trace "DFDF" >> choice [ -- parseList
+                    -- , parseDefinitionList
+                    parseHeader
+                    , parseNameSection
+                    , parseSynopsisSection
+                    , parsePara
+                    -- , parseTable
+                    --, parseCodeBlock
+                    -- , parseBlockQuote
+                    -- , parseNewParagraph
+                    -- , skipUnknownMacro
+                    ]
+

From 419ec786c4b3865246b6489937c043b956e165f4 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Tue, 10 Sep 2024 19:43:26 -0700
Subject: [PATCH 03/81] Register the mdoc reader

---
 src/Text/Pandoc/Readers.hs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index ba5170a10b56..2efd061dffed 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -51,6 +51,7 @@ module Text.Pandoc.Readers
   , readEPUB
   , readMuse
   , readMan
+  , readMdoc
   , readFB2
   , readIpynb
   , readCSV
@@ -106,6 +107,7 @@ import Text.Pandoc.Readers.TWiki
 import Text.Pandoc.Readers.Txt2Tags
 import Text.Pandoc.Readers.Vimwiki
 import Text.Pandoc.Readers.Man
+import Text.Pandoc.Readers.Mdoc
 import Text.Pandoc.Readers.CSV
 import Text.Pandoc.Readers.CslJson
 import Text.Pandoc.Readers.BibTeX
@@ -168,6 +170,7 @@ readers = [("native"       , TextReader readNative)
           ,("rtf"          , TextReader readRTF)
           ,("typst"        , TextReader readTypst)
           ,("djot"         , TextReader readDjot)
+          ,("mdoc"         , TextReader readMdoc)
           ]
 
 -- | Retrieve reader, extensions based on format spec (format+extensions).

From f44074e26b5dc1f3a19e27ad77d83d11fa65a670 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Tue, 10 Sep 2024 19:45:46 -0700
Subject: [PATCH 04/81] Checkpoint

I'll try to start doing real commits for myself from now on
---
 src/Text/Pandoc/Readers/Mdoc.hs     | 26 ++++++++++++++++---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 40 +++++++++++++++++++++++------
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index c19758c1f30e..cef5688839ec 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -94,6 +94,7 @@ msatisfy predic = P.tokenPrim show nextPos testTok
     nextPos _ _ (Macro _ pos':_) = pos'
     nextPos _ _ (Lit _ pos':_) = pos'
     nextPos _ _ (Str _ pos':_) = pos'
+    nextPos _ _ (Delim _ _ pos':_) = pos'
     nextPos _ _ (Tbl _ _ pos':_) = pos'
     nextPos a _ (Eol{}:x:xs) = nextPos a x xs
     nextPos pos _ [Eol] = pos
@@ -107,6 +108,11 @@ macro name = msatisfy t where
 emptyMacro :: PandocMonad m => T.Text -> MdocParser m MdocToken
 emptyMacro n = macro n <* eol
 
+delim :: PandocMonad m => DelimSide -> MdocParser m MdocToken
+delim side = msatisfy t where
+  t (Delim s _ _) = side == s
+  t _ = False
+
 str :: PandocMonad m => MdocParser m MdocToken
 str = msatisfy t where
   t Str{} = True
@@ -194,15 +200,29 @@ parseLit = do
   (Lit txt _) <- lit
   return $ B.str txt
 
+parseDelim :: PandocMonad m => DelimSide -> MdocParser m Inlines
+parseDelim pos = do
+  (Delim _ txt _) <- delim pos
+  return $ B.str txt
+
+litsToText :: PandocMonad m => MdocParser m Inlines
+litsToText = do
+  ls <- many lit
+  let strs = map (B.str . toString) ls
+  return $ mconcat $ intersperse B.space strs
+
 simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
 simpleInline nm xform = do
   macro nm
-  -- inlines <- mconcat <$> many1 parseLit
-  return $ xform $ B.text "go nuts"
+  openDelim <- mconcat <$> many (parseDelim Open)
+  inlines <- litsToText
+  closeDelim <- mconcat <$> many (parseDelim Close)
+  return $ openDelim <> xform inlines <> closeDelim
+
 
 -- Sy: callable, parsed, >0 arguments
 parseSy :: PandocMonad m => MdocParser m Inlines
-parseSy = trace "SSS" >> simpleInline "Sy" B.strong
+parseSy = simpleInline "Sy" B.strong
 
 parseInlineMacro :: PandocMonad m => MdocParser m Inlines
 parseInlineMacro = choice [ parseSy ]
diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index ad380a9b973c..7cd8bb05274a 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -16,7 +16,9 @@ Tokenizer for roff formats (man, ms).
 module Text.Pandoc.Readers.Mdoc.Lex
   ( MdocToken(..)
   , MdocTokens(..)
+  , DelimSide(..)
   , lexMdoc
+  , toString
   )
 where
 
@@ -57,12 +59,24 @@ data CellFormat =
 
 type TableRow = ([CellFormat], [MdocTokens])
 
+data DelimSide = Open | Middle | Close deriving (Show, Eq)
+
 data MdocToken = Str T.Text SourcePos
                | Macro T.Text SourcePos
                | Lit T.Text SourcePos
+               | Delim DelimSide T.Text SourcePos
                | Tbl [TableOption] [TableRow] SourcePos
+               | Eol
                deriving Show
 
+toString :: MdocToken -> T.Text
+toString (Str x _) = x
+toString (Macro x _) = x
+toString (Lit x _) = x
+toString (Delim _ x _) = x
+toString Tbl{} = mempty
+toString Eol = mempty
+
 newtype MdocTokens = MdocTokens { unRoffTokens :: Seq.Seq MdocToken }
         deriving (Show, Semigroup, Monoid)
 
@@ -82,8 +96,10 @@ type RoffLexer m = ParsecT Sources RoffState m
 -- Lexer: T.Text -> RoffToken
 --
 
-eofline :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m ()
-eofline = void newline <|> eof
+eofline :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m MdocToken
+eofline = do
+  void newline <|> eof
+  return Eol
 
 spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m Char
 spacetab = char ' ' <|> char '\t'
@@ -297,18 +313,25 @@ lexMacro = do
     isMacroChar '%' = True
     isMacroChar x = isAlphaNum x
 
+lexDelim :: PandocMonad m => RoffLexer m MdocToken
+lexDelim = do
+  pos <- getPosition
+  t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"]
+  return $ t pos
+
 lexLit :: PandocMonad m => RoffLexer m MdocToken
 lexLit = do
   pos <- getPosition
-  t <- mconcat <$> many (argText <|> quotedArg)
+  t <- argText <|> quotedArg
+  guard $ not $ T.null t
   return $ Lit t pos
 
 lexTextLine :: PandocMonad m => RoffLexer m MdocTokens
 lexTextLine = do
   pos <- getPosition
   guard $ sourceColumn pos == 1
-  notFollowedBy $ char '.'
   t <- mconcat <$> many anyText
+  eofline
   return $ singleTok $ Str t pos
 
 lexControlLine :: PandocMonad m => RoffLexer m MdocTokens
@@ -317,15 +340,16 @@ lexControlLine = do
   guard $ sourceColumn pos == 1
   char '.'
   m <- lexMacro
-  wds <- sepBy (lexLit) spacetab
-  eofline
-  return $ MdocTokens $ Seq.singleton m <> Seq.fromList wds
+  wds <- sepBy (lexDelim <|> lexLit) spacetab
+  skipSpaces
+  e <- eofline
+  return $ MdocTokens $ Seq.fromList $ (m:wds) <> [e]
 
 -- | Tokenize a string as a sequence of roff tokens.
 lexMdoc :: PandocMonad m => SourcePos -> T.Text -> m MdocTokens
 lexMdoc pos txt = do
   eithertokens <- readWithM (do setPosition pos
-                                mconcat <$> many mdocToken) def txt
+                                mconcat <$> manyTill mdocToken eof) def txt
   case eithertokens of
     Left e       -> throwError e
     Right tokenz -> return tokenz

From b301ee08692df0894d31f68d228bc75933c45f6e Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Tue, 10 Sep 2024 23:23:00 -0700
Subject: [PATCH 05/81] stash pandoc.cabal nopub

---
 pandoc.cabal | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pandoc.cabal b/pandoc.cabal
index 838ad674ec21..bd986ee84b38 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -547,6 +547,7 @@ library
   hs-source-dirs:  src
 
   exposed-modules: Text.Pandoc,
+                   Text.Pandoc.Readers.Roff,
                    Text.Pandoc.App,
                    Text.Pandoc.Data,
                    Text.Pandoc.Options,
@@ -588,6 +589,8 @@ library
                    Text.Pandoc.Readers.EPUB,
                    Text.Pandoc.Readers.Muse,
                    Text.Pandoc.Readers.Man,
+                   Text.Pandoc.Readers.Mdoc,
+                   Text.Pandoc.Readers.Mdoc.Lex,
                    Text.Pandoc.Readers.FB2,
                    Text.Pandoc.Readers.DokuWiki,
                    Text.Pandoc.Readers.Ipynb,
@@ -722,7 +725,6 @@ library
                    Text.Pandoc.Readers.Org.Parsing,
                    Text.Pandoc.Readers.Org.Shared,
                    Text.Pandoc.Readers.Metadata,
-                   Text.Pandoc.Readers.Roff,
                    Text.Pandoc.Writers.Docx.OpenXML,
                    Text.Pandoc.Writers.Docx.StyleMap,
                    Text.Pandoc.Writers.Docx.Table,
@@ -858,3 +860,8 @@ benchmark benchmark-pandoc
                    deepseq
   -- we increase heap size to avoid benchmarking garbage collection:
   ghc-options:     -rtsopts -with-rtsopts=-A8m -threaded
+
+executable lexroff
+  import:  common-executable
+  main-is: lexroff.hs
+  build-depends: pandoc, text

From fe1659dae52d26decceaa3c21e97f47e6f906140 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 12:19:05 -0700
Subject: [PATCH 06/81] Use standard spaceChar

Replacing spacetab copied from Roff lexer
---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index 7cd8bb05274a..4de43db65e54 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -101,9 +101,6 @@ eofline = do
   void newline <|> eof
   return Eol
 
-spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m Char
-spacetab = char ' ' <|> char '\t'
-
 characterCodeMap :: M.Map T.Text Char
 characterCodeMap =
   M.fromList $ map (\(x,y) -> (y,x)) characterCodes
@@ -273,7 +270,7 @@ argText :: PandocMonad m => RoffLexer m T.Text
 argText = mconcat <$> many1 (escape <|> regularText)
 
 spaceTabChar :: PandocMonad m => RoffLexer m T.Text
-spaceTabChar = T.singleton <$> spacetab
+spaceTabChar = T.singleton <$> spaceChar
 
 quotedArg :: PandocMonad m => RoffLexer m T.Text
 quotedArg = do

From 35900e9868ae117df8a4a3c85c07b852ac3a62d7 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 12:20:42 -0700
Subject: [PATCH 07/81] Consume spaces when lexing mdoc control tokens

---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index 4de43db65e54..9301841e24ad 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -314,6 +314,7 @@ lexDelim :: PandocMonad m => RoffLexer m MdocToken
 lexDelim = do
   pos <- getPosition
   t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"]
+  skipSpaces
   return $ t pos
 
 lexLit :: PandocMonad m => RoffLexer m MdocToken
@@ -321,6 +322,7 @@ lexLit = do
   pos <- getPosition
   t <- argText <|> quotedArg
   guard $ not $ T.null t
+  skipSpaces
   return $ Lit t pos
 
 lexTextLine :: PandocMonad m => RoffLexer m MdocTokens
@@ -337,9 +339,7 @@ lexControlLine = do
   guard $ sourceColumn pos == 1
   char '.'
   m <- lexMacro
-  wds <- sepBy (lexDelim <|> lexLit) spacetab
-  skipSpaces
-  e <- eofline
+  (wds, e) <- manyUntil (lexDelim <|> lexLit) eofline
   return $ MdocTokens $ Seq.fromList $ (m:wds) <> [e]
 
 -- | Tokenize a string as a sequence of roff tokens.

From 50383346eae1e4c9deffa03cf11cf1d3a89bdbcb Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 12:27:10 -0700
Subject: [PATCH 08/81] Backtrack delim lexing

---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index 9301841e24ad..b82fdeb2da0c 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -314,6 +314,7 @@ lexDelim :: PandocMonad m => RoffLexer m MdocToken
 lexDelim = do
   pos <- getPosition
   t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"]
+  eof <|> void (lookAhead (spaceChar <|> newline))
   skipSpaces
   return $ t pos
 
@@ -339,7 +340,7 @@ lexControlLine = do
   guard $ sourceColumn pos == 1
   char '.'
   m <- lexMacro
-  (wds, e) <- manyUntil (lexDelim <|> lexLit) eofline
+  (wds, e) <- manyUntil (try lexDelim <|> lexLit) eofline
   return $ MdocTokens $ Seq.fromList $ (m:wds) <> [e]
 
 -- | Tokenize a string as a sequence of roff tokens.

From fd2ee1bf6d5160e5f8c6876e0293cf82edc48944 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 14:40:00 -0700
Subject: [PATCH 09/81] Delete some traces

---
 src/Text/Pandoc/Readers/Mdoc.hs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index cef5688839ec..222ac633e876 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -79,9 +79,7 @@ readWithMTokens parser state input =
 parseMdoc :: PandocMonad m => MdocParser m Pandoc
 parseMdoc = do
   optional parsePrologue
-  trace "hasdf"
   bs <- many parseBlock <* eof
-  trace "DFDF"
   meta <- metadata <$> getState
   let (Pandoc _ blocks) = B.doc $ mconcat bs
   return $ Pandoc meta blocks
@@ -237,7 +235,7 @@ parsePara :: PandocMonad m => MdocParser m Blocks
 parsePara = B.para . B.trimInlines <$> parseInlines
 
 parseBlock :: PandocMonad m => MdocParser m Blocks
-parseBlock = trace "DFDF" >> choice [ -- parseList
+parseBlock = choice [ -- parseList
                     -- , parseDefinitionList
                     parseHeader
                     , parseNameSection

From d88e71df308ceb0cc5a7662f738c119132993a10 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 15:16:55 -0700
Subject: [PATCH 10/81] Remove skip to end

---
 src/Text/Pandoc/Readers/Mdoc.hs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 222ac633e876..b0eadd2055d3 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -179,7 +179,6 @@ parseNameSection = do
   macro "Nd"
   desc <- argsToInlines
   modifyState $ \s -> s{progName = mplus (progName s) (Just pname)}
-  skipMany (msatisfy (const True))
   return $ B.para $ B.code pname <> B.space <> "—" <> B.space <> desc
 
 parseSynopsisSection :: PandocMonad m => MdocParser m Blocks

From eec517811b557e006c87dfa0824b845649730f43 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 16:59:51 -0700
Subject: [PATCH 11/81] Lex blank mdoc lines

mandoc's roff(7) says "Blank text lines, which may include whitespace,
are only permitted within literal contexts." mandoc -T lint warns about
blank lines and inserts a roff `sp` request, which is handled
differently depending on the output format. My read is that mandoc
considers the handling of a blank line in non-literal context in mdoc(7)
to be undefined.
---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index b82fdeb2da0c..88e7ea4421bf 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -64,6 +64,7 @@ data DelimSide = Open | Middle | Close deriving (Show, Eq)
 data MdocToken = Str T.Text SourcePos
                | Macro T.Text SourcePos
                | Lit T.Text SourcePos
+               | Blank SourcePos
                | Delim DelimSide T.Text SourcePos
                | Tbl [TableOption] [TableRow] SourcePos
                | Eol
@@ -74,6 +75,7 @@ toString (Str x _) = x
 toString (Macro x _) = x
 toString (Lit x _) = x
 toString (Delim _ x _) = x
+toString Blank{} = mempty
 toString Tbl{} = mempty
 toString Eol = mempty
 
@@ -332,7 +334,9 @@ lexTextLine = do
   guard $ sourceColumn pos == 1
   t <- mconcat <$> many anyText
   eofline
-  return $ singleTok $ Str t pos
+  if T.null $ T.strip t
+     then return $ singleTok $ Blank pos
+     else return $ singleTok $ Str t pos
 
 lexControlLine :: PandocMonad m => RoffLexer m MdocTokens
 lexControlLine = do

From f4383b950a1e2669990cab1071bc06cf826b4865 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 17:14:34 -0700
Subject: [PATCH 12/81] Handle blanks in mdoc parser

---
 src/Text/Pandoc/Readers/Mdoc.hs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index b0eadd2055d3..d2cd392204bf 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -93,6 +93,7 @@ msatisfy predic = P.tokenPrim show nextPos testTok
     nextPos _ _ (Lit _ pos':_) = pos'
     nextPos _ _ (Str _ pos':_) = pos'
     nextPos _ _ (Delim _ _ pos':_) = pos'
+    nextPos _ _ (Blank pos':_) = pos'
     nextPos _ _ (Tbl _ _ pos':_) = pos'
     nextPos a _ (Eol{}:x:xs) = nextPos a x xs
     nextPos pos _ [Eol] = pos
@@ -132,6 +133,11 @@ literal n = msatisfy t where
   t (Lit n' _) = n == n'
   t _ = False
 
+blank :: PandocMonad m => MdocParser m MdocToken
+blank = msatisfy t where
+  t Blank{} = True
+  t _ = False
+
 eol :: PandocMonad m => MdocParser m ()
 eol = void $ msatisfy t where
   t Eol{} = True
@@ -233,6 +239,9 @@ parseInlines = mconcat <$> many1 parseInline
 parsePara :: PandocMonad m => MdocParser m Blocks
 parsePara = B.para . B.trimInlines <$> parseInlines
 
+skipBlanks :: PandocMonad m => MdocParser m Blocks
+skipBlanks = many1 blank *> mempty
+
 parseBlock :: PandocMonad m => MdocParser m Blocks
 parseBlock = choice [ -- parseList
                     -- , parseDefinitionList
@@ -242,6 +251,7 @@ parseBlock = choice [ -- parseList
                     , parsePara
                     -- , parseTable
                     --, parseCodeBlock
+                    , skipBlanks
                     -- , parseBlockQuote
                     -- , parseNewParagraph
                     -- , skipUnknownMacro

From 353943cc46ef28d103b3f4c183e261d8803f1c12 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 17:18:45 -0700
Subject: [PATCH 13/81] Comment upon parseStr's output

---
 src/Text/Pandoc/Readers/Mdoc.hs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index d2cd392204bf..94487e215ac5 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -193,6 +193,9 @@ parseSynopsisSection = do
   guard $ sec == ShSynopsis
   return mempty
 
+-- parseStr doesn't use B.text because roff(7) specifies that
+-- whitespace in text lines is treated literally.
+-- XXX is this what we actually want?
 parseStr :: PandocMonad m => MdocParser m Inlines
 parseStr = do
   (Str txt _) <- str

From 5a56e5d076dc7f03082d3443ac41791fc4673726 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 17:37:47 -0700
Subject: [PATCH 14/81] Remove unused/redundant imports

Copy-pasted. Maybe they'll come back.
---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index 88e7ea4421bf..4eacfc391e74 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -1,7 +1,6 @@
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
 {-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE ViewPatterns #-}
 {- |
    Module      : Text.Pandoc.Readers.Mdoc.Lex
    Copyright   : Copyright (C) 2018-2020 Yan Pashkovsky and John MacFarlane
@@ -22,15 +21,12 @@ module Text.Pandoc.Readers.Mdoc.Lex
   )
 where
 
-import Safe (lastDef)
 import Control.Monad (void, mzero, mplus, guard)
 import Control.Monad.Except (throwError)
-import Text.Pandoc.Class.PandocMonad
-       (getResourcePath, readFileFromDirs, PandocMonad(..), report)
-import Data.Char (isLower, toLower, toUpper, chr, isAscii, isAlphaNum)
+import Text.Pandoc.Class.PandocMonad (PandocMonad(..), report)
+import Data.Char (chr, isAscii, isAlphaNum)
 import Data.Default (Default)
 import qualified Data.Map as M
-import Data.List (intercalate)
 import qualified Data.Text as T
 import Text.Pandoc.Logging (LogMessage(..))
 import Text.Pandoc.Options
@@ -38,7 +34,6 @@ import Text.Pandoc.Parsing
 import Text.Pandoc.Shared (safeRead)
 import Text.Pandoc.RoffChar (characterCodes, combiningAccents)
 import qualified Data.Sequence as Seq
-import qualified Data.Foldable as Foldable
 import qualified Data.Text.Normalize as Normalize
 
 -- import Debug.Trace (traceShowId)

From 754ffc85f5b433fc3b8f5effdc6d416610580f84 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 19:40:43 -0700
Subject: [PATCH 15/81] Make more progress on inlines/paras

---
 src/Text/Pandoc/Readers/Mdoc.hs | 41 ++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 94487e215ac5..2bfd6f3f136d 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -143,6 +143,16 @@ eol = void $ msatisfy t where
   t Eol{} = True
   t _ = False
 
+inlineContextEnd :: PandocMonad m => MdocParser m ()
+inlineContextEnd = eof <|> (void . lookAhead $ msatisfy t) where
+  t Eol{} = True
+  t Macro{} = True
+  t Str{} = True -- shouldn't be lexed
+  t Tbl{} = True -- shouldn't be lexed
+  t Blank{} = True -- shouldn't be lexed
+  t Lit{} = False
+  t Delim{} = False
+
 argsToInlines :: PandocMonad m => MdocParser m Inlines
 argsToInlines = do
   ls <- manyTill arg eol
@@ -213,34 +223,49 @@ parseDelim pos = do
 
 litsToText :: PandocMonad m => MdocParser m Inlines
 litsToText = do
-  ls <- many lit
+  ls <- many1 lit
   let strs = map (B.str . toString) ls
   return $ mconcat $ intersperse B.space strs
 
 simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
 simpleInline nm xform = do
   macro nm
-  openDelim <- mconcat <$> many (parseDelim Open)
-  inlines <- litsToText
-  closeDelim <- mconcat <$> many (parseDelim Close)
-  return $ openDelim <> xform inlines <> closeDelim
+  segs <- manyTill segment inlineContextEnd
+  return $ mconcat $ intersperse B.space segs
+ where
+   segment = do
+      openDelim <- mconcat <$> many (parseDelim Open)
+      inlines <- option mempty litsToText
+      closeDelim <- mconcat <$> many (parseDelim Close)
+      let xform' x = if null x then mempty else xform x
+      return $ openDelim <> xform' inlines <> closeDelim
 
 
 -- Sy: callable, parsed, >0 arguments
+-- mandoc -T html formats Sy with a <b> tag, since it's not really
+-- semantically <strong>, but Strong is our best option in Pandoc
 parseSy :: PandocMonad m => MdocParser m Inlines
 parseSy = simpleInline "Sy" B.strong
 
+parseEm :: PandocMonad m => MdocParser m Inlines
+parseEm = simpleInline "Em" B.emph
+
 parseInlineMacro :: PandocMonad m => MdocParser m Inlines
-parseInlineMacro = choice [ parseSy ]
+parseInlineMacro = choice [ parseSy, parseEm ]
 
+-- TODO this doesn't handle inline macros being interrupted
+-- by other ones yet, but the lexer doesn't handle it yet
+-- either
 parseInline :: PandocMonad m => MdocParser m Inlines
 parseInline = parseStr <|> (parseInlineMacro <* eol)
 
 parseInlines :: PandocMonad m => MdocParser m Inlines
-parseInlines = mconcat <$> many1 parseInline
+parseInlines = mconcat . intersperse B.space <$> many1 parseInline
 
 parsePara :: PandocMonad m => MdocParser m Blocks
-parsePara = B.para . B.trimInlines <$> parseInlines
+parsePara = do
+  optional (emptyMacro "Pp")
+  B.para . B.trimInlines <$> parseInlines
 
 skipBlanks :: PandocMonad m => MdocParser m Blocks
 skipBlanks = many1 blank *> mempty

From e1eddf630dfe8cf147fec4ba77f3974bfd02c129 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 11 Sep 2024 19:42:53 -0700
Subject: [PATCH 16/81] Add parsing of code blocks from mdoc

---
 src/Text/Pandoc/Readers/Mdoc.hs | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 2bfd6f3f136d..218130c9934d 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -267,6 +267,22 @@ parsePara = do
   optional (emptyMacro "Pp")
   B.para . B.trimInlines <$> parseInlines
 
+-- CodeBlocks can't contain any other markup, but mdoc
+-- still interprets control lines within .Bd -literal
+-- blocks. Just ignoring this for now and failing if
+-- we get any control lines inside a Bd literal
+parseCodeBlock :: PandocMonad m => MdocParser m Blocks
+parseCodeBlock = do
+  macro "Bd" -- TODO will need to hoist
+  literal "-literal"
+  optional (literal "-offset" *> lit)
+  optional (literal "-compact")
+  eol
+  l <- T.unlines . map toString <$> many (str <|> blank)
+  emptyMacro "Ed"
+  return $ B.codeBlock l
+
+
 skipBlanks :: PandocMonad m => MdocParser m Blocks
 skipBlanks = many1 blank *> mempty
 
@@ -278,7 +294,7 @@ parseBlock = choice [ -- parseList
                     , parseSynopsisSection
                     , parsePara
                     -- , parseTable
-                    --, parseCodeBlock
+                    , parseCodeBlock
                     , skipBlanks
                     -- , parseBlockQuote
                     -- , parseNewParagraph

From 9e44bb6d40d1cf3ad25d2d4d7d460a33365783b4 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 12 Sep 2024 00:40:51 -0700
Subject: [PATCH 17/81] tktk: Unify roff escapes

---
 pandoc.cabal                           |   1 +
 src/Text/Pandoc/Readers/Mdoc/Lex.hs    | 215 +++-----------------
 src/Text/Pandoc/Readers/Roff.hs        | 271 +++++--------------------
 src/Text/Pandoc/Readers/Roff/Escape.hs | 198 ++++++++++++++++++
 4 files changed, 280 insertions(+), 405 deletions(-)
 create mode 100644 src/Text/Pandoc/Readers/Roff/Escape.hs

diff --git a/pandoc.cabal b/pandoc.cabal
index bd986ee84b38..b885c3db52f1 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -724,6 +724,7 @@ library
                    Text.Pandoc.Readers.Org.ParserState,
                    Text.Pandoc.Readers.Org.Parsing,
                    Text.Pandoc.Readers.Org.Shared,
+                   Text.Pandoc.Readers.Roff.Escape,
                    Text.Pandoc.Readers.Metadata,
                    Text.Pandoc.Writers.Docx.OpenXML,
                    Text.Pandoc.Writers.Docx.StyleMap,
diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index 4eacfc391e74..03e3f999aa35 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -1,6 +1,9 @@
 {-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TypeFamilies #-}
 {- |
    Module      : Text.Pandoc.Readers.Mdoc.Lex
    Copyright   : Copyright (C) 2018-2020 Yan Pashkovsky and John MacFarlane
@@ -21,20 +24,16 @@ module Text.Pandoc.Readers.Mdoc.Lex
   )
 where
 
-import Control.Monad (void, mzero, mplus, guard)
+import Control.Monad (void, guard)
 import Control.Monad.Except (throwError)
-import Text.Pandoc.Class.PandocMonad (PandocMonad(..), report)
-import Data.Char (chr, isAscii, isAlphaNum)
+import Text.Pandoc.Class.PandocMonad (PandocMonad(..))
+import Data.Char (isAlphaNum)
 import Data.Default (Default)
-import qualified Data.Map as M
 import qualified Data.Text as T
-import Text.Pandoc.Logging (LogMessage(..))
 import Text.Pandoc.Options
 import Text.Pandoc.Parsing
-import Text.Pandoc.Shared (safeRead)
-import Text.Pandoc.RoffChar (characterCodes, combiningAccents)
+import Text.Pandoc.Readers.Roff.Escape
 import qualified Data.Sequence as Seq
-import qualified Data.Text.Normalize as Normalize
 
 -- import Debug.Trace (traceShowId)
 
@@ -80,14 +79,20 @@ newtype MdocTokens = MdocTokens { unRoffTokens :: Seq.Seq MdocToken }
 singleTok :: MdocToken -> MdocTokens
 singleTok t = MdocTokens (Seq.singleton t)
 
-data RoffState = RoffState { tableTabChar :: Char
-                           } deriving Show
+data RoffState = RoffState  deriving Show
 
 instance Default RoffState where
-  def = RoffState { tableTabChar = '\t'
-                  }
+  def = RoffState
 
-type RoffLexer m = ParsecT Sources RoffState m
+type Lexer m = ParsecT Sources RoffState m
+
+instance RoffMonad MdocTokens where
+  type Token MdocTokens = T.Text
+  type State MdocTokens = RoffState
+  expandString = return ()
+  escString = return mempty
+  emit = id
+  backslash = (mempty <* char '\\') <|> (mempty <* string "\\E")
 
 --
 -- Lexer: T.Text -> RoffToken
@@ -98,164 +103,10 @@ eofline = do
   void newline <|> eof
   return Eol
 
-characterCodeMap :: M.Map T.Text Char
-characterCodeMap =
-  M.fromList $ map (\(x,y) -> (y,x)) characterCodes
-
-combiningAccentsMap :: M.Map T.Text Char
-combiningAccentsMap =
-  M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
-
-escape :: PandocMonad m => RoffLexer m T.Text
-escape = try $ do
-  backslash
-  escapeGlyph <|> escapeNormal
-
-escapeGlyph :: PandocMonad m => RoffLexer m T.Text
-escapeGlyph = do
-  c <- lookAhead (oneOf ['[','('])
-  escapeArg >>= resolveGlyph c
-
-resolveGlyph :: PandocMonad m => Char -> T.Text -> RoffLexer m T.Text
-resolveGlyph delimChar glyph = do
-  let cs = T.replace "_u" " u" glyph -- unicode glyphs separated by _
-  (case T.words cs of
-      []  -> mzero
-      [s] -> case M.lookup s characterCodeMap `mplus` readUnicodeChar s of
-               Nothing -> mzero
-               Just c  -> return $ T.singleton c
-      (s:ss) -> do
-        basechar <- case M.lookup s characterCodeMap `mplus`
-                         readUnicodeChar s of
-                      Nothing ->
-                        case T.unpack s of
-                          [ch] | isAscii ch && isAlphaNum ch ->
-                                 return ch
-                          _ -> mzero
-                      Just c  -> return c
-        let addAccents [] xs = return $ Normalize.normalize Normalize.NFC $
-                                        T.reverse xs
-            addAccents (a:as) xs =
-              case M.lookup a combiningAccentsMap `mplus` readUnicodeChar a of
-                Just x  -> addAccents as $ T.cons x xs
-                Nothing -> mzero
-        addAccents ss (T.singleton basechar) >>= \xs -> return xs)
-      <|> case delimChar of
-            '['  -> escUnknown ("\\[" <> glyph <> "]")
-            '('  -> escUnknown ("\\(" <> glyph)
-            '\'' -> escUnknown ("\\C'" <> glyph <> "'")
-            _    -> Prelude.fail "resolveGlyph: unknown glyph delimiter"
-
-readUnicodeChar :: T.Text -> Maybe Char
-readUnicodeChar t = case T.uncons t of
-  Just ('u', cs) | T.length cs > 3 -> chr <$> safeRead ("0x" <> cs)
-  _ -> Nothing
-
-escapeNormal :: PandocMonad m => RoffLexer m T.Text
-escapeNormal = do
-  c <- noneOf "{}"
-  let groffSkip = [escapeArg, countChar 1 (satisfy (/='\n'))]
-  case c of
-    ' ' -> return " " -- mandoc_char(7) says this should be a nonbreaking space
-    '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
-    '#' -> mempty <$ manyTill anyChar newline
-    '%' -> return mempty  -- optional hyphenation
-    '&' -> return mempty  -- nonprintable zero-width
-    ')' -> return mempty  -- nonprintable zero-width
-    '*' -> escIgnore '*' groffSkip
-    ',' -> return mempty  -- to fix spacing after roman
-    '-' -> return "-"
-    '.' -> return "."
-    '/' -> return mempty  -- to fix spacing before roman
-    '0' -> return "\x2007" -- digit-width space
-    ':' -> return mempty  -- zero-width break
-    'A' -> escIgnore 'A' [quoteArg]
-    'B' -> escIgnore 'B' [quoteArg]
-    'C' -> quoteArg >>= resolveGlyph '\''
-    'D' -> escIgnore 'D' [quoteArg]
-    'F' -> escIgnore 'F' groffSkip
-    'H' -> escIgnore 'H' [quoteArg]
-    'L' -> escIgnore 'L' [quoteArg]
-    'M' -> escIgnore 'M' groffSkip
-    'N' -> escIgnore 'N' [quoteArg]
-    'O' -> escIgnore 'O' groffSkip
-    'R' -> escIgnore 'R' [quoteArg]
-    'S' -> escIgnore 'S' [quoteArg]
-    'V' -> escIgnore 'V' groffSkip
-    'X' -> escIgnore 'X' [quoteArg]
-    'Y' -> escIgnore 'Y' groffSkip
-    'Z' -> escIgnore 'Z' [quoteArg]
-    '\'' -> return "'"
-    '\n' -> return mempty  -- line continuation
-    '^' -> return "\x200A" -- 1/12 em space
-    '_' -> return "_"
-    '`' -> return "`"
-    'a' -> return mempty  -- "non-interpreted leader character"
-    'b' -> escIgnore 'b' [quoteArg]
-    'c' -> return mempty  -- interrupt text processing
-    'd' -> escIgnore 'd' [] -- forward down 1/2em
-    'e' -> return "\\"
-    'f' -> escIgnore 'f' groffSkip
-    'g' -> escIgnore 'g' groffSkip
-    'h' -> escIgnore 'h' [quoteArg]
-    'k' -> escIgnore 'k' groffSkip
-    'l' -> escIgnore 'l' [quoteArg]
-    'm' -> escIgnore 'm' groffSkip
-    'n' -> escIgnore 'm' groffSkip
-    'o' -> escIgnore 'o' [quoteArg]
-    'p' -> escIgnore 'p' []
-    'r' -> escIgnore 'r' []
-    's' -> escIgnore 's' [escapeArg, signedNumber]
-    't' -> return "\t"
-    'u' -> escIgnore 'u' []
-    'v' -> escIgnore 'v' [quoteArg]
-    'w' -> escIgnore 'w' [quoteArg]
-    'x' -> escIgnore 'x' [quoteArg]
-    'z' -> escIgnore 'z' [countChar 1 anyChar]
-    '|' -> return "\x2006" --1/6 em space
-    '~' -> return "\160" -- nonbreaking space
-    '\\' -> return "\\"
-    _   -> return $ T.singleton c
-    -- man 7 groff: "If  a  backslash  is followed by a character that
-    -- does not constitute a defined escape sequence, the backslash
-    -- is  silently  ignored  and  the character maps to itself."
-
-escIgnore :: PandocMonad m
-          => Char
-          -> [RoffLexer m T.Text]
-          -> RoffLexer m T.Text
-escIgnore c argparsers = do
-  pos <- getPosition
-  arg <- snd <$> withRaw (choice argparsers) <|> return ""
-  report $ SkippedContent ("\\" <> T.cons c arg) pos
-  return mempty
-
-escUnknown :: PandocMonad m => T.Text -> RoffLexer m T.Text
-escUnknown s = do
-  pos <- getPosition
-  report $ SkippedContent s pos
-  return "\xFFFD"
-
-signedNumber :: PandocMonad m => RoffLexer m T.Text
-signedNumber = try $ do
-  sign <- option "" ("-" <$ char '-' <|> "" <$ char '+')
-  ds <- many1Char digit
-  return (sign <> ds)
-
--- Parses: [..] or (..
-escapeArg :: PandocMonad m => RoffLexer m T.Text
-escapeArg = choice
-    [ char '[' *> manyTillChar (noneOf ['\n',']']) (char ']')
-    , char '(' *> countChar 2 (satisfy (/='\n'))
-    ]
-
--- Parses: '..'
-quoteArg :: PandocMonad m => RoffLexer m T.Text
-quoteArg = char '\'' *> manyTillChar (noneOf ['\n','\'']) (char '\'')
 
 -- separate function from lexMacro since real man files sometimes do not
 -- follow the rules
-lexComment :: PandocMonad m => RoffLexer m MdocTokens
+lexComment :: PandocMonad m => Lexer m MdocTokens
 lexComment = do
   try $ string ".\\\""
   skipMany $ noneOf "\n"
@@ -263,13 +114,13 @@ lexComment = do
   return mempty
 
 
-argText :: PandocMonad m => RoffLexer m T.Text
+argText :: PandocMonad m => Lexer m T.Text
 argText = mconcat <$> many1 (escape <|> regularText)
 
-spaceTabChar :: PandocMonad m => RoffLexer m T.Text
+spaceTabChar :: PandocMonad m => Lexer m T.Text
 spaceTabChar = T.singleton <$> spaceChar
 
-quotedArg :: PandocMonad m => RoffLexer m T.Text
+quotedArg :: PandocMonad m => Lexer m T.Text
 quotedArg = do
   quoteChar
   t <- mconcat <$> many (escape <|> regularText <|> innerQuote <|> spaceTabChar)
@@ -281,23 +132,19 @@ quotedArg = do
       string "\"\""
       return "\""
 
-anyText :: PandocMonad m => RoffLexer m T.Text
+anyText :: PandocMonad m => Lexer m T.Text
 anyText = escape <|> regularText <|> quoteChar <|> spaceTabChar
 
-backslash :: PandocMonad m => RoffLexer m ()
-backslash = 
-  (mempty <* char '\\') <|> (mempty <* string "\\E")
-
-regularText :: PandocMonad m => RoffLexer m T.Text
+regularText :: PandocMonad m => Lexer m T.Text
 regularText = many1Char $ noneOf "\n\r\t \\\""
 
-quoteChar :: PandocMonad m => RoffLexer m T.Text
+quoteChar :: PandocMonad m => Lexer m T.Text
 quoteChar = T.singleton <$> char '"'
 
-mdocToken :: PandocMonad m => RoffLexer m MdocTokens
+mdocToken :: PandocMonad m => Lexer m MdocTokens
 mdocToken = lexComment <|> lexControlLine <|> lexTextLine
 
-lexMacro :: PandocMonad m => RoffLexer m MdocToken
+lexMacro :: PandocMonad m => Lexer m MdocToken
 lexMacro = do
   pos <- getPosition
   name <- many1Char (satisfy isMacroChar)
@@ -307,7 +154,7 @@ lexMacro = do
     isMacroChar '%' = True
     isMacroChar x = isAlphaNum x
 
-lexDelim :: PandocMonad m => RoffLexer m MdocToken
+lexDelim :: PandocMonad m => Lexer m MdocToken
 lexDelim = do
   pos <- getPosition
   t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"]
@@ -315,7 +162,7 @@ lexDelim = do
   skipSpaces
   return $ t pos
 
-lexLit :: PandocMonad m => RoffLexer m MdocToken
+lexLit :: PandocMonad m => Lexer m MdocToken
 lexLit = do
   pos <- getPosition
   t <- argText <|> quotedArg
@@ -323,7 +170,7 @@ lexLit = do
   skipSpaces
   return $ Lit t pos
 
-lexTextLine :: PandocMonad m => RoffLexer m MdocTokens
+lexTextLine :: PandocMonad m => Lexer m MdocTokens
 lexTextLine = do
   pos <- getPosition
   guard $ sourceColumn pos == 1
@@ -333,7 +180,7 @@ lexTextLine = do
      then return $ singleTok $ Blank pos
      else return $ singleTok $ Str t pos
 
-lexControlLine :: PandocMonad m => RoffLexer m MdocTokens
+lexControlLine :: PandocMonad m => Lexer m MdocTokens
 lexControlLine = do
   pos <- getPosition
   guard $ sourceColumn pos == 1
diff --git a/src/Text/Pandoc/Readers/Roff.hs b/src/Text/Pandoc/Readers/Roff.hs
index 429b90ce16ca..60e3ac6e7b8f 100644
--- a/src/Text/Pandoc/Readers/Roff.hs
+++ b/src/Text/Pandoc/Readers/Roff.hs
@@ -2,6 +2,7 @@
 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE ViewPatterns #-}
+{-# LANGUAGE TypeFamilies #-}
 {- |
    Module      : Text.Pandoc.Readers.Roff
    Copyright   : Copyright (C) 2018-2020 Yan Pashkovsky and John MacFarlane
@@ -29,11 +30,11 @@ module Text.Pandoc.Readers.Roff
 where
 
 import Safe (lastDef)
-import Control.Monad (void, mzero, mplus, guard)
+import Control.Monad (void, guard)
 import Control.Monad.Except (throwError)
 import Text.Pandoc.Class.PandocMonad
        (getResourcePath, readFileFromDirs, PandocMonad(..), report)
-import Data.Char (isLower, toLower, toUpper, chr, isAscii, isAlphaNum)
+import Data.Char (isLower, toLower, toUpper, isAlphaNum)
 import Data.Default (Default)
 import qualified Data.Map as M
 import Data.List (intercalate)
@@ -42,10 +43,9 @@ import Text.Pandoc.Logging (LogMessage(..))
 import Text.Pandoc.Options
 import Text.Pandoc.Parsing
 import Text.Pandoc.Shared (safeRead)
-import Text.Pandoc.RoffChar (characterCodes, combiningAccents)
+import Text.Pandoc.Readers.Roff.Escape
 import qualified Data.Sequence as Seq
 import qualified Data.Foldable as Foldable
-import qualified Data.Text.Normalize as Normalize
 
 -- import Debug.Trace (traceShowId)
 
@@ -65,6 +65,52 @@ data LinePart = RoffStr T.Text
               | MacroArg Int
               deriving Show
 
+instance RoffMonad RoffTokens where
+  type Token RoffTokens = [LinePart]
+  type State RoffTokens = RoffState
+  emit t = [RoffStr t]
+  expandString = try $ do
+    pos <- getPosition
+    char '\\'
+    char '*'
+    cs <- escapeArg <|> countChar 1 anyChar
+    s <- linePartsToText <$> resolveText cs pos
+    addToInput s
+  escString = try $ do
+    pos <- getPosition
+    (do cs <- escapeArg <|> countChar 1 anyChar
+        resolveText cs pos)
+      <|> mempty <$ char 'S'
+  backslash = do
+    char '\\'
+    mode <- roffMode <$> getState
+    case mode of
+      -- experimentally, it seems you don't always need to double
+      -- the backslash in macro defs.  It's essential with \\$1,
+      -- but not with \\f[I].  So we make the second one optional.
+      CopyMode   -> optional $ char '\\'
+      NormalMode -> return ()
+  escE = do
+      mode <- roffMode <$> getState
+      case mode of
+        CopyMode   -> return mempty
+        NormalMode -> return [RoffStr "\\"]
+  escFont = do
+    font <- escapeArg <|> countChar 1 alphaNum
+    font' <- if T.null font || font == "P"
+                then prevFont <$> getState
+                else return $ foldr processFontLetter defaultFontSpec $ T.unpack font
+    updateState $ \st -> st{ prevFont = currentFont st
+                           , currentFont = font' }
+    return [Font font']
+    where
+      processFontLetter c fs
+                | isLower c    = processFontLetter (toUpper c) fs
+      processFontLetter 'B' fs = fs{ fontBold = True }
+      processFontLetter 'I' fs = fs{ fontItalic = True }
+      processFontLetter 'C' fs = fs{ fontMonospace = True }
+      processFontLetter _   fs = fs -- do nothing
+
 type Arg = [LinePart]
 
 type TableOption = (T.Text, T.Text)
@@ -133,198 +179,6 @@ eofline = void newline <|> eof <|> () <$ lookAhead (string "\\}")
 spacetab :: (Stream s m Char, UpdateSourcePos s Char) => ParsecT s u m Char
 spacetab = char ' ' <|> char '\t'
 
-characterCodeMap :: M.Map T.Text Char
-characterCodeMap =
-  M.fromList $ map (\(x,y) -> (y,x)) characterCodes
-
-combiningAccentsMap :: M.Map T.Text Char
-combiningAccentsMap =
-  M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
-
-escape :: PandocMonad m => RoffLexer m [LinePart]
-escape = try $ do
-  backslash
-  escapeGlyph <|> escapeNormal
-
-escapeGlyph :: PandocMonad m => RoffLexer m [LinePart]
-escapeGlyph = do
-  c <- lookAhead (oneOf ['[','('])
-  escapeArg >>= resolveGlyph c
-
-resolveGlyph :: PandocMonad m => Char -> T.Text -> RoffLexer m [LinePart]
-resolveGlyph delimChar glyph = do
-  let cs = T.replace "_u" " u" glyph -- unicode glyphs separated by _
-  (case T.words cs of
-      []  -> mzero
-      [s] -> case M.lookup s characterCodeMap `mplus` readUnicodeChar s of
-               Nothing -> mzero
-               Just c  -> return [RoffStr $ T.singleton c]
-      (s:ss) -> do
-        basechar <- case M.lookup s characterCodeMap `mplus`
-                         readUnicodeChar s of
-                      Nothing ->
-                        case T.unpack s of
-                          [ch] | isAscii ch && isAlphaNum ch ->
-                                 return ch
-                          _ -> mzero
-                      Just c  -> return c
-        let addAccents [] xs = return $ Normalize.normalize Normalize.NFC $
-                                        T.reverse xs
-            addAccents (a:as) xs =
-              case M.lookup a combiningAccentsMap `mplus` readUnicodeChar a of
-                Just x  -> addAccents as $ T.cons x xs
-                Nothing -> mzero
-        addAccents ss (T.singleton basechar) >>= \xs -> return [RoffStr xs])
-      <|> case delimChar of
-            '['  -> escUnknown ("\\[" <> glyph <> "]")
-            '('  -> escUnknown ("\\(" <> glyph)
-            '\'' -> escUnknown ("\\C'" <> glyph <> "'")
-            _    -> Prelude.fail "resolveGlyph: unknown glyph delimiter"
-
-readUnicodeChar :: T.Text -> Maybe Char
-readUnicodeChar t = case T.uncons t of
-  Just ('u', cs) | T.length cs > 3 -> chr <$> safeRead ("0x" <> cs)
-  _ -> Nothing
-
-escapeNormal :: PandocMonad m => RoffLexer m [LinePart]
-escapeNormal = do
-  c <- noneOf "{}"
-  optional expandString
-  case c of
-    ' ' -> return [RoffStr " "]
-    '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
-    '#' -> mempty <$ manyTill anyChar newline
-    '%' -> return mempty  -- optional hyphenation
-    '&' -> return mempty  -- nonprintable zero-width
-    ')' -> return mempty  -- nonprintable zero-width
-    '*' -> escString
-    ',' -> return mempty  -- to fix spacing after roman
-    '-' -> return [RoffStr "-"]
-    '.' -> return [RoffStr "."]
-    '/' -> return mempty  -- to fix spacing before roman
-    '0' -> return [RoffStr "\x2007"] -- digit-width space
-    ':' -> return mempty  -- zero-width break
-    'A' -> quoteArg >>= checkDefined
-    'B' -> escIgnore 'B' [quoteArg]
-    'C' -> quoteArg >>= resolveGlyph '\''
-    'D' -> escIgnore 'D' [quoteArg]
-    'E' -> do
-      mode <- roffMode <$> getState
-      case mode of
-        CopyMode   -> return mempty
-        NormalMode -> return [RoffStr "\\"]
-    'H' -> escIgnore 'H' [quoteArg]
-    'L' -> escIgnore 'L' [quoteArg]
-    'M' -> escIgnore 'M' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'N' -> escIgnore 'N' [quoteArg]
-    'O' -> escIgnore 'O' [countChar 1 (oneOf ['0','1'])]
-    'R' -> escIgnore 'R' [quoteArg]
-    'S' -> escIgnore 'S' [quoteArg]
-    'V' -> escIgnore 'V' [escapeArg, countChar 1 alphaNum]
-    'X' -> escIgnore 'X' [quoteArg]
-    'Y' -> escIgnore 'Y' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'Z' -> escIgnore 'Z' [quoteArg]
-    '\'' -> return [RoffStr "'"]
-    '\n' -> return mempty  -- line continuation
-    '^' -> return [RoffStr "\x200A"] -- 1/12 em space
-    '_' -> return [RoffStr "_"]
-    '`' -> return [RoffStr "`"]
-    'a' -> return mempty  -- "non-interpreted leader character"
-    'b' -> escIgnore 'b' [quoteArg]
-    'c' -> return mempty  -- interrupt text processing
-    'd' -> escIgnore 'd' [] -- forward down 1/2em
-    'e' -> return [RoffStr "\\"]
-    'f' -> escFont
-    'g' -> escIgnore 'g' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'h' -> escIgnore 'h' [quoteArg]
-    'k' -> escIgnore 'k' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'l' -> escIgnore 'l' [quoteArg]
-    'm' -> escIgnore 'm' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'n' -> escIgnore 'm' [escapeArg, countChar 1 (satisfy (/='\n'))]
-    'o' -> escIgnore 'o' [quoteArg]
-    'p' -> escIgnore 'p' []
-    'r' -> escIgnore 'r' []
-    's' -> escIgnore 's' [escapeArg, signedNumber]
-    't' -> return [RoffStr "\t"]
-    'u' -> escIgnore 'u' []
-    'v' -> escIgnore 'v' [quoteArg]
-    'w' -> escIgnore 'w' [quoteArg]
-    'x' -> escIgnore 'x' [quoteArg]
-    'z' -> escIgnore 'z' [countChar 1 anyChar]
-    '|' -> return [RoffStr "\x2006"] --1/6 em space
-    '~' -> return [RoffStr "\160"] -- nonbreaking space
-    '\\' -> do
-      mode <- roffMode <$> getState
-      case mode of
-        CopyMode   -> char '\\'
-        NormalMode -> return '\\'
-      return [RoffStr "\\"]
-    _   -> return [RoffStr $ T.singleton c]
-    -- man 7 groff: "If  a  backslash  is followed by a character that
-    -- does not constitute a defined escape sequence, the backslash
-    -- is  silently  ignored  and  the character maps to itself."
-
-escIgnore :: PandocMonad m
-          => Char
-          -> [RoffLexer m T.Text]
-          -> RoffLexer m [LinePart]
-escIgnore c argparsers = do
-  pos <- getPosition
-  arg <- snd <$> withRaw (choice argparsers) <|> return ""
-  report $ SkippedContent ("\\" <> T.cons c arg) pos
-  return mempty
-
-escUnknown :: PandocMonad m => T.Text -> RoffLexer m [LinePart]
-escUnknown s = do
-  pos <- getPosition
-  report $ SkippedContent s pos
-  return [RoffStr "\xFFFD"]
-
-signedNumber :: PandocMonad m => RoffLexer m T.Text
-signedNumber = try $ do
-  sign <- option "" ("-" <$ char '-' <|> "" <$ char '+')
-  ds <- many1Char digit
-  return (sign <> ds)
-
--- Parses: [..] or (..
-escapeArg :: PandocMonad m => RoffLexer m T.Text
-escapeArg = choice
-    [ char '[' *> optional expandString *>
-                  manyTillChar (noneOf ['\n',']']) (char ']')
-    , char '(' *> optional expandString *>
-                  countChar 2 (satisfy (/='\n'))
-    ]
-
-expandString :: PandocMonad m => RoffLexer m ()
-expandString = try $ do
-  pos <- getPosition
-  char '\\'
-  char '*'
-  cs <- escapeArg <|> countChar 1 anyChar
-  s <- linePartsToText <$> resolveText cs pos
-  addToInput s
-
--- Parses: '..'
-quoteArg :: PandocMonad m => RoffLexer m T.Text
-quoteArg = char '\'' *> manyTillChar (noneOf ['\n','\'']) (char '\'')
-
-escFont :: PandocMonad m => RoffLexer m [LinePart]
-escFont = do
-  font <- escapeArg <|> countChar 1 alphaNum
-  font' <- if T.null font || font == "P"
-              then prevFont <$> getState
-              else return $ foldr processFontLetter defaultFontSpec $ T.unpack font
-  updateState $ \st -> st{ prevFont = currentFont st
-                         , currentFont = font' }
-  return [Font font']
-  where
-    processFontLetter c fs
-              | isLower c    = processFontLetter (toUpper c) fs
-    processFontLetter 'B' fs = fs{ fontBold = True }
-    processFontLetter 'I' fs = fs{ fontItalic = True }
-    processFontLetter 'C' fs = fs{ fontMonospace = True }
-    processFontLetter _   fs = fs -- do nothing
-
 -- separate function from lexMacro since real man files sometimes do not
 -- follow the rules
 lexComment :: PandocMonad m => RoffLexer m RoffTokens
@@ -624,20 +478,6 @@ lexArgs = do
         char '"'
         return [RoffStr "\""]
 
-checkDefined :: PandocMonad m => T.Text -> RoffLexer m [LinePart]
-checkDefined name = do
-  macros <- customMacros <$> getState
-  case M.lookup name macros of
-    Just _  -> return [RoffStr "1"]
-    Nothing -> return [RoffStr "0"]
-
-escString :: PandocMonad m => RoffLexer m [LinePart]
-escString = try $ do
-  pos <- getPosition
-  (do cs <- escapeArg <|> countChar 1 anyChar
-      resolveText cs pos)
-    <|> mempty <$ char 'S'
-
 -- strings and macros share namespace
 resolveText :: PandocMonad m
               => T.Text -> SourcePos -> RoffLexer m [LinePart]
@@ -668,17 +508,6 @@ linePart :: PandocMonad m => RoffLexer m [LinePart]
 linePart = macroArg <|> escape <|>
            regularText <|> quoteChar <|> spaceTabChar
 
-backslash :: PandocMonad m => RoffLexer m ()
-backslash = do
-  char '\\'
-  mode <- roffMode <$> getState
-  case mode of
-    -- experimentally, it seems you don't always need to double
-    -- the backslash in macro defs.  It's essential with \\$1,
-    -- but not with \\f[I].  So we make the second one optional.
-    CopyMode   -> optional $ char '\\'
-    NormalMode -> return ()
-
 macroArg :: PandocMonad m => RoffLexer m [LinePart]
 macroArg = try $ do
   pos <- getPosition
diff --git a/src/Text/Pandoc/Readers/Roff/Escape.hs b/src/Text/Pandoc/Readers/Roff/Escape.hs
new file mode 100644
index 000000000000..109c8c92f1ba
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Roff/Escape.hs
@@ -0,0 +1,198 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE TypeFamilyDependencies #-}
+module Text.Pandoc.Readers.Roff.Escape
+  ( escape,
+    escapeArg,
+    RoffMonad(..),
+  )
+where
+import Text.Pandoc.Class.PandocMonad
+    ( PandocMonad(..), report, PandocMonad(..), report )
+import Control.Monad
+    ( mzero, mplus, mzero, mplus )
+import Data.Char (chr, isAscii, isAlphaNum)
+import qualified Data.Map as M
+import qualified Data.Text as T
+import Text.Pandoc.Logging (LogMessage(..))
+import Text.Pandoc.Parsing
+import Text.Pandoc.Shared (safeRead)
+import qualified Data.Text.Normalize as Normalize
+import Text.Pandoc.RoffChar (characterCodes, combiningAccents)
+
+type Lexer m x = ParsecT Sources (State x) m
+
+class (Monoid (Token x)) => RoffMonad x where
+  type State x = a | a -> x
+  type Token x = a | a -> x
+  emit :: T.Text -> Token x
+  expandString :: PandocMonad m => Lexer m x ()
+  escString :: PandocMonad m => Lexer m x (Token x)
+  backslash :: PandocMonad m => Lexer m x ()
+  checkDefined :: PandocMonad m => T.Text -> Lexer m x (Token x)
+  checkDefined _ = mempty
+  escE :: PandocMonad m => Lexer m x (Token x)
+  escE = return mempty
+  escFont :: PandocMonad m => Lexer m x (Token x)
+  escFont = escIgnore 'f' [escapeArg, countChar 1 (satisfy (/='\n'))]
+
+
+characterCodeMap :: M.Map T.Text Char
+characterCodeMap =
+  M.fromList $ map (\(x,y) -> (y,x)) characterCodes
+
+combiningAccentsMap :: M.Map T.Text Char
+combiningAccentsMap =
+  M.fromList $ map (\(x,y) -> (y,x)) combiningAccents
+
+escape ::  (PandocMonad m, RoffMonad x) => Lexer m x (Token x)
+escape = try $ do
+  backslash
+  escapeGlyph <|> escapeNormal
+
+escapeGlyph :: (PandocMonad m, RoffMonad x) => Lexer m x (Token x)
+escapeGlyph = do
+  c <- lookAhead (oneOf ['[','('])
+  escapeArg >>= resolveGlyph c
+
+resolveGlyph :: (PandocMonad m, RoffMonad x) => Char -> T.Text -> Lexer m x (Token x)
+resolveGlyph delimChar glyph = do
+  let cs = T.replace "_u" " u" glyph -- unicode glyphs separated by _
+  (case T.words cs of
+      []  -> mzero
+      [s] -> case M.lookup s characterCodeMap `mplus` readUnicodeChar s of
+               Nothing -> mzero
+               Just c  -> return $ emit $ T.singleton c
+      (s:ss) -> do
+        basechar <- case M.lookup s characterCodeMap `mplus`
+                         readUnicodeChar s of
+                      Nothing ->
+                        case T.unpack s of
+                          [ch] | isAscii ch && isAlphaNum ch ->
+                                 return ch
+                          _ -> mzero
+                      Just c  -> return c
+        let addAccents [] xs = return $ Normalize.normalize Normalize.NFC $
+                                        T.reverse xs
+            addAccents (a:as) xs =
+              case M.lookup a combiningAccentsMap `mplus` readUnicodeChar a of
+                Just x  -> addAccents as $ T.cons x xs
+                Nothing -> mzero
+        addAccents ss (T.singleton basechar) >>= \xs -> return $ emit xs)
+      <|> case delimChar of
+            '['  -> escUnknown ("\\[" <> glyph <> "]")
+            '('  -> escUnknown ("\\(" <> glyph)
+            '\'' -> escUnknown ("\\C'" <> glyph <> "'")
+            _    -> Prelude.fail "resolveGlyph: unknown glyph delimiter"
+
+readUnicodeChar :: T.Text -> Maybe Char
+readUnicodeChar t = case T.uncons t of
+  Just ('u', cs) | T.length cs > 3 -> chr <$> safeRead ("0x" <> cs)
+  _ -> Nothing
+
+escapeNormal :: (PandocMonad m, RoffMonad x) => Lexer m x (Token x)
+escapeNormal = do
+  c <- noneOf "{}"
+  optional expandString
+  let groffSkip = [escapeArg, countChar 1 (satisfy (/='\n'))]
+  case c of
+    ' ' -> return $ emit " " -- mandoc_char(7) says this should be a nonbreaking space
+    '"' -> mempty <$ skipMany (satisfy (/='\n')) -- line comment
+    '#' -> mempty <$ manyTill anyChar newline
+    '%' -> return mempty  -- optional hyphenation
+    '&' -> return mempty  -- nonprintable zero-width
+    ')' -> return mempty  -- nonprintable zero-width
+    '*' -> escString
+    ',' -> return mempty  -- to fix spacing after roman
+    '-' -> return $ emit "-"
+    '.' -> return $ emit "."
+    '/' -> return mempty  -- to fix spacing before roman
+    '0' -> return $ emit "\x2007" -- digit-width space
+    ':' -> return mempty  -- zero-width break
+    'A' -> quoteArg >>= checkDefined
+    'B' -> escIgnore 'B' [quoteArg]
+    'C' -> quoteArg >>= resolveGlyph '\''
+    'D' -> escIgnore 'D' [quoteArg]
+    'E' -> escE
+    'F' -> escIgnore 'F' groffSkip
+    'H' -> escIgnore 'H' [quoteArg]
+    'L' -> escIgnore 'L' [quoteArg]
+    'M' -> escIgnore 'M' groffSkip
+    'N' -> escIgnore 'N' [quoteArg]
+    'O' -> escIgnore 'O' [countChar 1 (oneOf ['0','1'])]
+    'R' -> escIgnore 'R' [quoteArg]
+    'S' -> escIgnore 'S' [quoteArg]
+    'V' -> escIgnore 'V' groffSkip
+    'X' -> escIgnore 'X' [quoteArg]
+    'Y' -> escIgnore 'Y' groffSkip
+    'Z' -> escIgnore 'Z' [quoteArg]
+    '\'' -> return $ emit "'"
+    '\n' -> return mempty  -- line continuation
+    '^' -> return $ emit "\x200A" -- 1/12 em space
+    '_' -> return $ emit "_"
+    '`' -> return $ emit "`"
+    'a' -> return mempty  -- "non-interpreted leader character"
+    'b' -> escIgnore 'b' [quoteArg]
+    'c' -> return mempty  -- interrupt text processing
+    'd' -> escIgnore 'd' [] -- forward down 1/2em
+    'e' -> return $ emit "\\"
+    'f' -> escFont
+    'g' -> escIgnore 'g' groffSkip
+    'h' -> escIgnore 'h' [quoteArg]
+    'k' -> escIgnore 'k' groffSkip
+    'l' -> escIgnore 'l' [quoteArg]
+    'm' -> escIgnore 'm' groffSkip
+    'n' -> escIgnore 'm' groffSkip
+    'o' -> escIgnore 'o' [quoteArg]
+    'p' -> escIgnore 'p' []
+    'r' -> escIgnore 'r' []
+    's' -> escIgnore 's' [escapeArg, signedNumber]
+    't' -> return $ emit "\t"
+    'u' -> escIgnore 'u' []
+    'v' -> escIgnore 'v' [quoteArg]
+    'w' -> escIgnore 'w' [quoteArg]
+    'x' -> escIgnore 'x' [quoteArg]
+    'z' -> escIgnore 'z' [countChar 1 anyChar]
+    '|' -> return $ emit "\x2006" --1/6 em space
+    '~' -> return $ emit "\160" -- nonbreaking space
+    '\\' -> return $ emit "\\"
+    _   -> return $ emit $ T.singleton c
+    -- man 7 groff: "If  a  backslash  is followed by a character that
+    -- does not constitute a defined escape sequence, the backslash
+    -- is  silently  ignored  and  the character maps to itself."
+
+escIgnore :: (PandocMonad m, RoffMonad x)
+          => Char
+          -> [Lexer m x T.Text]
+          -> Lexer m x (Token x)
+escIgnore c argparsers = do
+  pos <- getPosition
+  arg <- snd <$> withRaw (choice argparsers) <|> return ""
+  report $ SkippedContent ("\\" <> T.cons c arg) pos
+  return mempty
+
+escUnknown :: (PandocMonad m, RoffMonad x) => T.Text -> Lexer m x (Token x)
+escUnknown s = do
+  pos <- getPosition
+  report $ SkippedContent s pos
+  return $ emit "\xFFFD"
+
+signedNumber :: (PandocMonad m, RoffMonad x) => Lexer m x T.Text
+signedNumber = try $ do
+  sign <- option "" ("-" <$ char '-' <|> "" <$ char '+')
+  ds <- many1Char digit
+  return (sign <> ds)
+
+-- Parses: [..] or (..
+escapeArg :: (PandocMonad m, RoffMonad x) => Lexer m x T.Text
+escapeArg = choice
+    [ char '[' *> optional expandString *>
+                  manyTillChar (noneOf ['\n',']']) (char ']')
+    , char '(' *> optional expandString *>
+                  countChar 2 (satisfy (/='\n'))
+    ]
+
+-- Parses: '..'
+quoteArg :: (PandocMonad m, RoffMonad x) => Lexer m x T.Text
+quoteArg = char '\'' *> manyTillChar (noneOf ['\n','\'']) (char '\'')

From a714f9ddb8d8fe8cbfdbb969e3da52738628de23 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 12 Sep 2024 16:14:54 -0700
Subject: [PATCH 18/81] Comment upon my approach

---
 src/Text/Pandoc/Readers/Mdoc.hs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 218130c9934d..a7da4c2e0100 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -30,6 +30,11 @@ import qualified Text.Pandoc.Parsing as P
 import qualified Data.Foldable as Foldable
 import Text.Pandoc.Shared (stringify)
 
+  {- As a general principle, if mandoc -T lint issues a WARNING admonition
+     or worse about a construct, I consider it fair game for this reader to
+     do something different than what mandoc does with it, including bailing
+     out instead of recovering. -}
+
 data MdocSection
   = ShName
   | ShSynopsis

From 44316511ec1fcdc296cc7128543a86e5bb96d547 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 12 Sep 2024 16:17:02 -0700
Subject: [PATCH 19/81] Add Xr parser

---
 src/Text/Pandoc/Readers/Mdoc.hs | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index a7da4c2e0100..6b0954bf5643 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -232,6 +232,14 @@ litsToText = do
   let strs = map (B.str . toString) ls
   return $ mconcat $ intersperse B.space strs
 
+delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines)
+delimitedArgs p = do
+    openDelim <- mconcat <$> many (parseDelim Open)
+    inlines <- p
+    closeDelim <- mconcat <$> many (parseDelim Close)
+    return (openDelim, inlines, closeDelim)
+
+-- TODO extract further?
 simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
 simpleInline nm xform = do
   macro nm
@@ -239,9 +247,7 @@ simpleInline nm xform = do
   return $ mconcat $ intersperse B.space segs
  where
    segment = do
-      openDelim <- mconcat <$> many (parseDelim Open)
-      inlines <- option mempty litsToText
-      closeDelim <- mconcat <$> many (parseDelim Close)
+      (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText
       let xform' x = if null x then mempty else xform x
       return $ openDelim <> xform' inlines <> closeDelim
 
@@ -255,8 +261,22 @@ parseSy = simpleInline "Sy" B.strong
 parseEm :: PandocMonad m => MdocParser m Inlines
 parseEm = simpleInline "Em" B.emph
 
+-- Xr
+parseXr :: PandocMonad m => MdocParser m Inlines
+parseXr = do
+  macro "Xr"
+  (open, (name, section), close) <- delimitedArgs f
+  let ref = name <> "(" <> section <> ")"
+  return $ open <> B.spanWith attr (B.str ref) <> close
+    where
+      f = do
+        n <- lit <?> "Xr manual name"
+        s <- lit <?> "Xr manual section"
+        return (toString n, toString s)
+      attr = (mempty, ["Xr"], mempty)
+
 parseInlineMacro :: PandocMonad m => MdocParser m Inlines
-parseInlineMacro = choice [ parseSy, parseEm ]
+parseInlineMacro = choice [ parseSy, parseEm, parseXr ]
 
 -- TODO this doesn't handle inline macros being interrupted
 -- by other ones yet, but the lexer doesn't handle it yet

From 759d327de20e71b0dbe9dfb8e26f17892e5e9977 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 12 Sep 2024 17:32:40 -0700
Subject: [PATCH 20/81] Lex callable macros in parsed control lines

See mdoc(7) section MACRO SYNTAX
---
 pandoc.cabal                           |   2 +
 src/Text/Pandoc/Readers/Mdoc/Lex.hs    |  15 +-
 src/Text/Pandoc/Readers/Mdoc/Macros.hs | 185 +++++++++++++++++++++++++
 3 files changed, 200 insertions(+), 2 deletions(-)
 create mode 100644 src/Text/Pandoc/Readers/Mdoc/Macros.hs

diff --git a/pandoc.cabal b/pandoc.cabal
index b885c3db52f1..39508116c1fe 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -522,6 +522,7 @@ library
                  split                 >= 0.2      && < 0.3,
                  syb                   >= 0.1      && < 0.8,
                  tagsoup               >= 0.14.6   && < 0.15,
+                 template-haskell,
                  temporary             >= 1.1      && < 1.4,
                  texmath               >= 0.12.8.10 && < 0.13,
                  text                  >= 1.1.1.0  && < 2.2,
@@ -702,6 +703,7 @@ library
                    Text.Pandoc.Readers.LaTeX.Parsing,
                    Text.Pandoc.Readers.LaTeX.SIunitx,
                    Text.Pandoc.Readers.LaTeX.Table,
+                   Text.Pandoc.Readers.Mdoc.Macros,
                    Text.Pandoc.Readers.Typst.Parsing,
                    Text.Pandoc.Readers.Typst.Math,
                    Text.Pandoc.Readers.ODT.Base,
diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index 03e3f999aa35..db58ac27d204 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -33,6 +33,7 @@ import qualified Data.Text as T
 import Text.Pandoc.Options
 import Text.Pandoc.Parsing
 import Text.Pandoc.Readers.Roff.Escape
+import Text.Pandoc.Readers.Mdoc.Macros
 import qualified Data.Sequence as Seq
 
 -- import Debug.Trace (traceShowId)
@@ -154,6 +155,12 @@ lexMacro = do
     isMacroChar '%' = True
     isMacroChar x = isAlphaNum x
 
+lexCallableMacro :: PandocMonad m => Lexer m MdocToken
+lexCallableMacro = do
+  m@(Macro name _) <- lexMacro
+  guard $ isCallableMacro name
+  return m
+
 lexDelim :: PandocMonad m => Lexer m MdocToken
 lexDelim = do
   pos <- getPosition
@@ -185,9 +192,13 @@ lexControlLine = do
   pos <- getPosition
   guard $ sourceColumn pos == 1
   char '.'
-  m <- lexMacro
-  (wds, e) <- manyUntil (try lexDelim <|> lexLit) eofline
+  m@(Macro name _) <- lexMacro
+  let parsed = isParsedMacro name
+  (wds, e) <- manyUntil (l parsed) eofline
   return $ MdocTokens $ Seq.fromList $ (m:wds) <> [e]
+    where
+      l True = try lexDelim <|> try lexCallableMacro <|> lexLit
+      l False = try lexDelim <|> lexLit
 
 -- | Tokenize a string as a sequence of roff tokens.
 lexMdoc :: PandocMonad m => SourcePos -> T.Text -> m MdocTokens
diff --git a/src/Text/Pandoc/Readers/Mdoc/Macros.hs b/src/Text/Pandoc/Readers/Mdoc/Macros.hs
new file mode 100644
index 000000000000..174a0de39bbd
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Mdoc/Macros.hs
@@ -0,0 +1,185 @@
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Mdoc.Macros
+   Copyright   : 
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : a
+   Stability   : WIP
+   Portability : portable
+
+-}
+module Text.Pandoc.Readers.Mdoc.Macros (isParsedMacro, isCallableMacro) where
+
+import Data.Set (member, fromList, Set)
+import Data.Text
+import Language.Haskell.TH.Syntax (lift)
+
+isParsedMacro :: Text -> Bool
+isParsedMacro a = member a parsedMacros
+
+isCallableMacro :: Text -> Bool
+isCallableMacro a = member a callableMacros
+
+parsedMacros :: Set Text
+parsedMacros = $(lift (fromList [
+  "Ac",
+  "Ad",
+  "An",
+  "Ao",
+  "Ap",
+  "Aq",
+  "Ar",
+  "At",
+  "Bc",
+  "Bo",
+  "Bq",
+  "Brc",
+  "Bro",
+  "Brq",
+  "Bsx",
+  "Bx",
+  "Cd",
+  "Cm",
+  "D1",
+  "Dc",
+  "Dl",
+  "Do",
+  "Dq",
+  "Dv",
+  "Dx",
+  "Ec",
+  "Em",
+  "En",
+  "Eo",
+  "Er",
+  "Es",
+  "Ev",
+  "Fa",
+  "Fc",
+  "Fl",
+  "Fn",
+  "Fr",
+  "Ft",
+  "Fx",
+  "Ic",
+  "In",
+  "It",
+  "Li",
+  "Lk",
+  "Ms",
+  "Mt",
+  "Nm",
+  "No",
+  "Ns",
+  "Nx",
+  "Oc",
+  "Oo",
+  "Op",
+  "Ot",
+  "Ox",
+  "Pa",
+  "Pc",
+  "Pf",
+  "Po",
+  "Pq",
+  "Qc",
+  "Ql",
+  "Qo",
+  "Qq",
+  "Sc",
+  "Sh",
+  "So",
+  "Sq",
+  "Ss",
+  "St",
+  "Sx",
+  "Sy",
+  "Ta",
+  "Tn",
+  "Ux",
+  "Va",
+  "Vt",
+  "Xc",
+  "Xo",
+  "Xr"] :: Set Text))
+
+callableMacros :: Set Text
+callableMacros = $(lift (fromList [
+  "Ac",
+  "Ad",
+  "An",
+  "Ao",
+  "Ap",
+  "Aq",
+  "Ar",
+  "At",
+  "Bc",
+  "Bo",
+  "Bq",
+  "Brc",
+  "Bro",
+  "Brq",
+  "Bsx",
+  "Bx",
+  "Cd",
+  "Cm",
+  "Dc",
+  "Do",
+  "Dq",
+  "Dv",
+  "Dx",
+  "Ec",
+  "Em",
+  "En",
+  "Eo",
+  "Er",
+  "Es",
+  "Ev",
+  "Fa",
+  "Fc",
+  "Fl",
+  "Fn",
+  "Fo",
+  "Fr",
+  "Ft",
+  "Fx",
+  "Ic",
+  "In",
+  "Li",
+  "Lk",
+  "Ms",
+  "Mt",
+  "Nm",
+  "No",
+  "Ns",
+  "Nx",
+  "Oc",
+  "Oo",
+  "Op",
+  "Ot",
+  "Ox",
+  "Pa",
+  "Pc",
+  "Pf",
+  "Po",
+  "Pq",
+  "Qc",
+  "Ql",
+  "Qo",
+  "Qq",
+  "Sc",
+  "So",
+  "Sq",
+  "St",
+  "Sx",
+  "Sy",
+  "Ta",
+  "Tn",
+  "Ux",
+  "Va",
+  "Vt",
+  "Xc",
+  "Xo",
+  "Xr"] :: Set Text))

From 784a873628912abc489eea604b05beaa36827403 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 12 Sep 2024 19:07:19 -0700
Subject: [PATCH 21/81] Correct comments

---
 src/Text/Pandoc/Readers/Mdoc.hs     | 2 +-
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 6b0954bf5643..46def38d15e3 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -9,7 +9,7 @@
    Stability   : WIP
    Portability : portable
 
-Conversion of man to 'Pandoc' document.
+Conversion of mdoc to 'Pandoc' document.
 -}
 module Text.Pandoc.Readers.Mdoc (readMdoc) where
 
diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index db58ac27d204..6cb0a1bd2608 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -13,7 +13,7 @@
    Stability   : WIP
    Portability : portable
 
-Tokenizer for roff formats (man, ms).
+Tokenizer for mdoc format
 -}
 module Text.Pandoc.Readers.Mdoc.Lex
   ( MdocToken(..)

From 1e8f7b700fed28aba71acd9717e594f28619292d Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 12 Sep 2024 19:21:55 -0700
Subject: [PATCH 22/81] Parse Nm macro (outside Sh NAME)

---
 src/Text/Pandoc/Readers/Mdoc.hs | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 46def38d15e3..ec7498eaf0fd 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -248,18 +248,36 @@ simpleInline nm xform = do
  where
    segment = do
       (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText
-      let xform' x = if null x then mempty else xform x
-      return $ openDelim <> xform' inlines <> closeDelim
+      return $ openDelim <> xform inlines <> closeDelim
 
+eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines
+eliminateEmpty x y = if null y then mempty else x y
+
+cls :: T.Text -> B.Attr
+cls x = (mempty, [x], mempty)
 
 -- Sy: callable, parsed, >0 arguments
 -- mandoc -T html formats Sy with a <b> tag, since it's not really
 -- semantically <strong>, but Strong is our best option in Pandoc
 parseSy :: PandocMonad m => MdocParser m Inlines
-parseSy = simpleInline "Sy" B.strong
+parseSy = simpleInline "Sy" (eliminateEmpty B.strong)
 
 parseEm :: PandocMonad m => MdocParser m Inlines
-parseEm = simpleInline "Em" B.emph
+parseEm = simpleInline "Em" (eliminateEmpty B.emph)
+
+parseNm :: PandocMonad m => MdocParser m Inlines
+parseNm = do
+  mnm <- (progName <$> getState)
+  case mnm of
+    Nothing -> do
+      (_, nm, _) <- lookAhead $ delimitedArgs $ option mempty litsToText
+      guard $ not (null nm)
+      simpleInline "Nm" ok
+    Just nm -> simpleInline "Nm" $ \x ->
+      if null x
+         then B.codeWith (cls "Nm") nm
+         else ok x
+  where ok = B.codeWith (cls "Nm") . stringify
 
 -- Xr
 parseXr :: PandocMonad m => MdocParser m Inlines
@@ -267,16 +285,15 @@ parseXr = do
   macro "Xr"
   (open, (name, section), close) <- delimitedArgs f
   let ref = name <> "(" <> section <> ")"
-  return $ open <> B.spanWith attr (B.str ref) <> close
+  return $ open <> B.spanWith (cls "Xr") (B.str ref) <> close
     where
       f = do
         n <- lit <?> "Xr manual name"
         s <- lit <?> "Xr manual section"
         return (toString n, toString s)
-      attr = (mempty, ["Xr"], mempty)
 
 parseInlineMacro :: PandocMonad m => MdocParser m Inlines
-parseInlineMacro = choice [ parseSy, parseEm, parseXr ]
+parseInlineMacro = choice [ parseSy, parseEm, parseNm, parseXr ]
 
 -- TODO this doesn't handle inline macros being interrupted
 -- by other ones yet, but the lexer doesn't handle it yet

From 57c97095b23dd84e72f1d3468fdc0b7437812dfb Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 12 Sep 2024 19:23:09 -0700
Subject: [PATCH 23/81] Parse multiple inline macros per control line

---
 src/Text/Pandoc/Readers/Mdoc.hs | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index ec7498eaf0fd..a73a1fcc6783 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -295,11 +295,8 @@ parseXr = do
 parseInlineMacro :: PandocMonad m => MdocParser m Inlines
 parseInlineMacro = choice [ parseSy, parseEm, parseNm, parseXr ]
 
--- TODO this doesn't handle inline macros being interrupted
--- by other ones yet, but the lexer doesn't handle it yet
--- either
 parseInline :: PandocMonad m => MdocParser m Inlines
-parseInline = parseStr <|> (parseInlineMacro <* eol)
+parseInline = parseStr <|> mconcat <$> (many1Till (parseInlineMacro <|> litsToText) eol)
 
 parseInlines :: PandocMonad m => MdocParser m Inlines
 parseInlines = mconcat . intersperse B.space <$> many1 parseInline

From 34a83d91770f4253d1f7844ac81a978a3758082f Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 12 Sep 2024 19:23:27 -0700
Subject: [PATCH 24/81] Add a deprecated macro synonym

---
 src/Text/Pandoc/Readers/Mdoc.hs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index a73a1fcc6783..c41a5a31bf3a 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -298,12 +298,13 @@ parseInlineMacro = choice [ parseSy, parseEm, parseNm, parseXr ]
 parseInline :: PandocMonad m => MdocParser m Inlines
 parseInline = parseStr <|> mconcat <$> (many1Till (parseInlineMacro <|> litsToText) eol)
 
+-- TODO probably need some kind of fold to deal with Ns
 parseInlines :: PandocMonad m => MdocParser m Inlines
 parseInlines = mconcat . intersperse B.space <$> many1 parseInline
 
 parsePara :: PandocMonad m => MdocParser m Blocks
 parsePara = do
-  optional (emptyMacro "Pp")
+  optional (emptyMacro "Pp" <|> emptyMacro "Lp")  -- Lp: deprecated synonym for Pp
   B.para . B.trimInlines <$> parseInlines
 
 -- CodeBlocks can't contain any other markup, but mdoc

From a3a892ab552b0748244bb60503bb2e9e0e714fa5 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sat, 14 Sep 2024 11:26:00 -0700
Subject: [PATCH 25/81] Plan for special handling of AUTHORS

---
 src/Text/Pandoc/Readers/Mdoc.hs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index c41a5a31bf3a..679888473d2e 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -38,6 +38,7 @@ import Text.Pandoc.Shared (stringify)
 data MdocSection
   = ShName
   | ShSynopsis
+  | ShAuthors
   | ShOther
   deriving (Show, Eq)
 
@@ -179,6 +180,7 @@ parsePrologue = do
 shToSectionMode :: T.Text -> MdocSection
 shToSectionMode "NAME" = ShName
 shToSectionMode "SYNOPSIS" = ShSynopsis
+shToSectionMode "AUTHORS" = ShAuthors
 shToSectionMode _ = ShOther
 
 parseHeader :: PandocMonad m => MdocParser m Blocks

From 4b3ecb86d9eb16cf9b169b7c326d5e005cf3802d Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sat, 14 Sep 2024 11:47:35 -0700
Subject: [PATCH 26/81] Correct pasted comment

---
 src/Text/Pandoc/Readers/Mdoc.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 679888473d2e..fbcfcafdf665 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -59,7 +59,7 @@ instance Default ManState where
 type MdocParser m = P.ParsecT [MdocToken] ManState m
 
 
--- | Read man (troff) from an input string and return a Pandoc document.
+-- | Read mdoc from an input string and return a Pandoc document.
 readMdoc :: (PandocMonad m, ToSources a)
         => ReaderOptions
         -> a

From 91f24a042a1c299e3e677a63a1c764932f93ff23 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sat, 14 Sep 2024 12:41:13 -0700
Subject: [PATCH 27/81] Add macros for line-scoped enclosures

---
 src/Text/Pandoc/Readers/Mdoc.hs | 52 ++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index fbcfcafdf665..ab14590fd883 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -252,6 +252,12 @@ simpleInline nm xform = do
       (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText
       return $ openDelim <> xform inlines <> closeDelim
 
+lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
+lineEnclosure nm xform = do
+  macro nm
+  inner <-  many1 (parseInlineMacro <|> litsToText)
+  return $ (xform . mconcat . intersperse B.space) inner
+
 eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines
 eliminateEmpty x y = if null y then mempty else x y
 
@@ -267,6 +273,27 @@ parseSy = simpleInline "Sy" (eliminateEmpty B.strong)
 parseEm :: PandocMonad m => MdocParser m Inlines
 parseEm = simpleInline "Em" (eliminateEmpty B.emph)
 
+parseQl :: PandocMonad m => MdocParser m Inlines
+parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify
+
+parseDq :: PandocMonad m => MdocParser m Inlines
+parseDq = lineEnclosure "Dq" B.doubleQuoted
+
+parseSq :: PandocMonad m => MdocParser m Inlines
+parseSq = lineEnclosure "Sq" B.singleQuoted
+
+parsePq :: PandocMonad m => MdocParser m Inlines
+parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")"
+
+parseBq :: PandocMonad m => MdocParser m Inlines
+parseBq = lineEnclosure "Bq" $ \x -> "[" <> x <> "]"
+
+parseBrq :: PandocMonad m => MdocParser m Inlines
+parseBrq = lineEnclosure "Brq" $ \x -> "{" <> x <> "}"
+
+parseAq :: PandocMonad m => MdocParser m Inlines
+parseAq = lineEnclosure "Aq" $ \x -> "⟨" <> x <> "⟩"
+
 parseNm :: PandocMonad m => MdocParser m Inlines
 parseNm = do
   mnm <- (progName <$> getState)
@@ -294,15 +321,32 @@ parseXr = do
         s <- lit <?> "Xr manual section"
         return (toString n, toString s)
 
+-- TODO should possibly rename this function b/c some of these are
+-- Mdoc block partial-implicit macros. Unclear if this distinction
+-- is going to be relevant.
 parseInlineMacro :: PandocMonad m => MdocParser m Inlines
-parseInlineMacro = choice [ parseSy, parseEm, parseNm, parseXr ]
+parseInlineMacro =
+  choice
+    [ parseSy,
+      parseEm,
+      parseNm,
+      parseXr,
+      parseQl,
+      parseSq,
+      parseDq,
+      parsePq,
+      parseBq,
+      parseBrq,
+      parseAq
+    ]
+
+parseInline :: PandocMonad m => MdocParser m [Inlines]
+parseInline = (parseStr >>= return . (:[])) <|> (many1Till (parseInlineMacro <|> litsToText) eol)
 
-parseInline :: PandocMonad m => MdocParser m Inlines
-parseInline = parseStr <|> mconcat <$> (many1Till (parseInlineMacro <|> litsToText) eol)
 
 -- TODO probably need some kind of fold to deal with Ns
 parseInlines :: PandocMonad m => MdocParser m Inlines
-parseInlines = mconcat . intersperse B.space <$> many1 parseInline
+parseInlines = mconcat . intersperse B.space . mconcat <$> many1 parseInline
 
 parsePara :: PandocMonad m => MdocParser m Blocks
 parsePara = do

From 497477d99bfa3b43ebbe9ecbf8ca22fa72846268 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 11:10:47 -0700
Subject: [PATCH 28/81] Add Qq

---
 src/Text/Pandoc/Readers/Mdoc.hs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index ab14590fd883..8571455afa37 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -282,6 +282,9 @@ parseDq = lineEnclosure "Dq" B.doubleQuoted
 parseSq :: PandocMonad m => MdocParser m Inlines
 parseSq = lineEnclosure "Sq" B.singleQuoted
 
+parseQq :: PandocMonad m => MdocParser m Inlines
+parseQq = lineEnclosure "Qq" $ \x -> "\"" <> x <> "\""
+
 parsePq :: PandocMonad m => MdocParser m Inlines
 parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")"
 

From 713192393723b4ac12ebd19c7cc9b208f77c3a99 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 14:16:51 -0700
Subject: [PATCH 29/81] Extract spacify function

This will handle Ns in the future
---
 src/Text/Pandoc/Readers/Mdoc.hs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 8571455afa37..bd7223cd2993 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -163,7 +163,7 @@ argsToInlines :: PandocMonad m => MdocParser m Inlines
 argsToInlines = do
   ls <- manyTill arg eol
   let strs = map (B.str . toString) ls
-  return $ mconcat $ intersperse B.space strs
+  return $ spacify strs
 
 parsePrologue :: PandocMonad m => MdocParser m ()
 parsePrologue = do
@@ -232,7 +232,7 @@ litsToText :: PandocMonad m => MdocParser m Inlines
 litsToText = do
   ls <- many1 lit
   let strs = map (B.str . toString) ls
-  return $ mconcat $ intersperse B.space strs
+  return $ spacify strs
 
 delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines)
 delimitedArgs p = do
@@ -246,7 +246,7 @@ simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m
 simpleInline nm xform = do
   macro nm
   segs <- manyTill segment inlineContextEnd
-  return $ mconcat $ intersperse B.space segs
+  return $ spacify segs
  where
    segment = do
       (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText
@@ -258,6 +258,9 @@ lineEnclosure nm xform = do
   inner <-  many1 (parseInlineMacro <|> litsToText)
   return $ (xform . mconcat . intersperse B.space) inner
 
+spacify :: [Inlines] -> Inlines
+spacify = mconcat . intersperse B.space
+
 eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines
 eliminateEmpty x y = if null y then mempty else x y
 

From 1f5da9a9358b1552f3ab27263ae53be6e367c5a7 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 14:18:29 -0700
Subject: [PATCH 30/81] Rework enclosures and support multiline ones

---
 src/Text/Pandoc/Readers/Mdoc.hs | 42 ++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index bd7223cd2993..98d817e09f3a 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -234,6 +234,16 @@ litsToText = do
   let strs = map (B.str . toString) ls
   return $ spacify strs
 
+litsAndDelimsToText :: PandocMonad m => MdocParser m Inlines
+litsAndDelimsToText = do
+  ods <- mconcat <$> many (parseDelim Open)
+  ls <- many lit
+  cds <- mconcat <$> if null ods && null ls
+                        then many1 (parseDelim Close)
+                        else many (parseDelim Close)
+  let strs = map (B.str . toString) ls
+  return $ ods <> spacify strs <> cds
+
 delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines)
 delimitedArgs p = do
     openDelim <- mconcat <$> many (parseDelim Open)
@@ -255,12 +265,27 @@ simpleInline nm xform = do
 lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
 lineEnclosure nm xform = do
   macro nm
-  inner <-  many1 (parseInlineMacro <|> litsToText)
-  return $ (xform . mconcat . intersperse B.space) inner
+  --- XXX wtf
+  (first, further, finally) <- delimitedArgs
+    (manyTill
+      (parseInlineMacro <|> (try (litsAndDelimsToText <* notFollowedBy eol))
+        <|> litsToText) (lookAhead (many (delim Close) *> eol)))
+  return $ first <> xform (spacify further) <> finally
 
 spacify :: [Inlines] -> Inlines
 spacify = mconcat . intersperse B.space
 
+{- Compatibility note: mandoc permits, and doesn't warn on, "vertical" macros
+ (Pp, Bl/El, Bd/Ed) inside of "horizontal" block partial-explicit quotations
+like Do/Dc. However there are no OpenBSD manual pages that employ such markup
+and it doesn't look right when rendered. We don't attempt to consume anything
+but pandoc inlines inside of these multiline enclosures. -}
+multilineEnclosure :: PandocMonad m => T.Text -> T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
+multilineEnclosure op cl xform = do
+  macro op
+  (first, further, finally) <- delimitedArgs (manyTill parseInlines (macro cl))
+  return $ first <> xform (spacify further) <> finally
+
 eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines
 eliminateEmpty x y = if null y then mempty else x y
 
@@ -282,6 +307,9 @@ parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify
 parseDq :: PandocMonad m => MdocParser m Inlines
 parseDq = lineEnclosure "Dq" B.doubleQuoted
 
+parseDo :: PandocMonad m => MdocParser m Inlines
+parseDo = ptrace $ multilineEnclosure "Do" "Dc" B.doubleQuoted
+
 parseSq :: PandocMonad m => MdocParser m Inlines
 parseSq = lineEnclosure "Sq" B.singleQuoted
 
@@ -343,16 +371,18 @@ parseInlineMacro =
       parsePq,
       parseBq,
       parseBrq,
-      parseAq
+      parseAq,
+      parseDo
     ]
 
-parseInline :: PandocMonad m => MdocParser m [Inlines]
-parseInline = (parseStr >>= return . (:[])) <|> (many1Till (parseInlineMacro <|> litsToText) eol)
+parseInline :: PandocMonad m => MdocParser m Inlines
+parseInline = parseStr  <|>
+  ((parseInlineMacro <|> litsAndDelimsToText) <* optional eol)
 
 
 -- TODO probably need some kind of fold to deal with Ns
 parseInlines :: PandocMonad m => MdocParser m Inlines
-parseInlines = mconcat . intersperse B.space . mconcat <$> many1 parseInline
+parseInlines = spacify <$> many1 parseInline
 
 parsePara :: PandocMonad m => MdocParser m Blocks
 parsePara = do

From 10bdda8c86e54a084b471cef092c07f37b68b790 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 14:19:20 -0700
Subject: [PATCH 31/81] fixup: qq

---
 src/Text/Pandoc/Readers/Mdoc.hs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 98d817e09f3a..0a23ea2a7b35 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -368,6 +368,7 @@ parseInlineMacro =
       parseQl,
       parseSq,
       parseDq,
+      parseQq,
       parsePq,
       parseBq,
       parseBrq,

From 41f1454f638933abc04c4d86722c2903fdc6c12c Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 14:36:37 -0700
Subject: [PATCH 32/81] Add Op macro

Which we just treat as another kind of Bq, for now
---
 src/Text/Pandoc/Readers/Mdoc.hs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 0a23ea2a7b35..5fee2d6c044e 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -322,6 +322,12 @@ parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")"
 parseBq :: PandocMonad m => MdocParser m Inlines
 parseBq = lineEnclosure "Bq" $ \x -> "[" <> x <> "]"
 
+-- For our purposes this probably behaves identically to Bq
+-- in most circumstances but I might need to do something
+-- special with it in SYNOPSIS
+parseOp :: PandocMonad m => MdocParser m Inlines
+parseOp = lineEnclosure "Op" $ \x -> "[" <> x <> "]"
+
 parseBrq :: PandocMonad m => MdocParser m Inlines
 parseBrq = lineEnclosure "Brq" $ \x -> "{" <> x <> "}"
 
@@ -366,6 +372,7 @@ parseInlineMacro =
       parseNm,
       parseXr,
       parseQl,
+      parseOp,
       parseSq,
       parseDq,
       parseQq,

From 03d82ba87c932599caa442b4ac17e64e3e5fd649 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 14:37:37 -0700
Subject: [PATCH 33/81] Add partial Ns support

We parse this into a RawInline, which will usually get removed by the
spacify function, once I write the version of this that actually does
that.
---
 src/Text/Pandoc/Readers/Mdoc.hs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 5fee2d6c044e..831d81d53ea5 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -361,6 +361,9 @@ parseXr = do
         s <- lit <?> "Xr manual section"
         return (toString n, toString s)
 
+parseNs :: PandocMonad m => MdocParser m Inlines
+parseNs = macro "Ns" >> return (B.rawInline "mdoc" "Ns")
+
 -- TODO should possibly rename this function b/c some of these are
 -- Mdoc block partial-implicit macros. Unclear if this distinction
 -- is going to be relevant.
@@ -380,7 +383,8 @@ parseInlineMacro =
       parseBq,
       parseBrq,
       parseAq,
-      parseDo
+      parseDo,
+      parseNs
     ]
 
 parseInline :: PandocMonad m => MdocParser m Inlines

From ffe16cceaa17d0f5af144abff298324922a82d0a Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 18:01:12 -0700
Subject: [PATCH 34/81] Lex called macros better

---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index 6cb0a1bd2608..350a4ffb30a2 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -149,6 +149,7 @@ lexMacro :: PandocMonad m => Lexer m MdocToken
 lexMacro = do
   pos <- getPosition
   name <- many1Char (satisfy isMacroChar)
+  eof <|> void (lookAhead (spaceChar <|> newline))
   skipSpaces
   return $ Macro name pos
   where

From 87097af014e3a9c1e50e6727417a1988f76727d4 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 18:01:36 -0700
Subject: [PATCH 35/81] Lex inner quotes in quoted args correctly

---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index 350a4ffb30a2..ac3a753aac06 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -124,7 +124,7 @@ spaceTabChar = T.singleton <$> spaceChar
 quotedArg :: PandocMonad m => Lexer m T.Text
 quotedArg = do
   quoteChar
-  t <- mconcat <$> many (escape <|> regularText <|> innerQuote <|> spaceTabChar)
+  t <- mconcat <$> many (try innerQuote <|> escape <|> regularText <|> spaceTabChar)
   quoteChar
   notFollowedBy quoteChar
   return t

From fbd18a24b04fe6d7c5e36a6375fc4ef3846230bd Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 21:07:35 -0700
Subject: [PATCH 36/81] Handle the Ns macro

---
 src/Text/Pandoc/Readers/Mdoc.hs | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 831d81d53ea5..254932790764 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -272,8 +272,28 @@ lineEnclosure nm xform = do
         <|> litsToText) (lookAhead (many (delim Close) *> eol)))
   return $ first <> xform (spacify further) <> finally
 
+noSpace :: Inlines
+noSpace = B.rawInline "mdoc" "Ns"
+
+data SpacifyState = SpacifyState { accum :: [Inlines], prev :: Inlines, ns :: Bool }
+instance Default SpacifyState where def = SpacifyState [] mempty False
+
+foldNoSpaces :: [Inlines] -> [Inlines]
+foldNoSpaces xs = (finalize . foldl go def) xs
+  where
+    go :: SpacifyState -> Inlines -> SpacifyState
+    go s x
+      | ns s && x == noSpace = s
+      | ns s                 = s{prev = prev s <> x, ns = False}
+      |         x == noSpace = s{ns = True}
+      | null (prev s)        = s{prev = x}
+      | otherwise            = s{accum = accum s <> [prev s], prev = x}
+    finalize s
+      | null (prev s) = accum s
+      | otherwise     = accum s <> [prev s]
+
 spacify :: [Inlines] -> Inlines
-spacify = mconcat . intersperse B.space
+spacify = mconcat . intersperse B.space . foldNoSpaces
 
 {- Compatibility note: mandoc permits, and doesn't warn on, "vertical" macros
  (Pp, Bl/El, Bd/Ed) inside of "horizontal" block partial-explicit quotations
@@ -362,7 +382,7 @@ parseXr = do
         return (toString n, toString s)
 
 parseNs :: PandocMonad m => MdocParser m Inlines
-parseNs = macro "Ns" >> return (B.rawInline "mdoc" "Ns")
+parseNs = macro "Ns" >> return noSpace
 
 -- TODO should possibly rename this function b/c some of these are
 -- Mdoc block partial-implicit macros. Unclear if this distinction

From f9888047294f4a2bfa4e0af1a8be822da03a2826 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 21:08:11 -0700
Subject: [PATCH 37/81] fixup: Remove Do trace

---
 src/Text/Pandoc/Readers/Mdoc.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 254932790764..8977e3d6b3f3 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -328,7 +328,7 @@ parseDq :: PandocMonad m => MdocParser m Inlines
 parseDq = lineEnclosure "Dq" B.doubleQuoted
 
 parseDo :: PandocMonad m => MdocParser m Inlines
-parseDo = ptrace $ multilineEnclosure "Do" "Dc" B.doubleQuoted
+parseDo =  multilineEnclosure "Do" "Dc" B.doubleQuoted
 
 parseSq :: PandocMonad m => MdocParser m Inlines
 parseSq = lineEnclosure "Sq" B.singleQuoted

From 83a523d23b15b330057aaefefa08105ba58411da Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 21:09:04 -0700
Subject: [PATCH 38/81] Lex quoted delimiters on control lines

---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index ac3a753aac06..fe50f64f02ac 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -24,11 +24,12 @@ module Text.Pandoc.Readers.Mdoc.Lex
   )
 where
 
-import Control.Monad (void, guard)
+import Control.Monad (void, guard, when)
 import Control.Monad.Except (throwError)
 import Text.Pandoc.Class.PandocMonad (PandocMonad(..))
 import Data.Char (isAlphaNum)
 import Data.Default (Default)
+import Data.Maybe (isJust)
 import qualified Data.Text as T
 import Text.Pandoc.Options
 import Text.Pandoc.Parsing
@@ -165,7 +166,9 @@ lexCallableMacro = do
 lexDelim :: PandocMonad m => Lexer m MdocToken
 lexDelim = do
   pos <- getPosition
+  q <- optionMaybe quoteChar
   t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"]
+  when (isJust q) (void quoteChar)
   eof <|> void (lookAhead (spaceChar <|> newline))
   skipSpaces
   return $ t pos

From 6ed3d8387f25d5267aed4b0086159d57f7f799e1 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 21:09:39 -0700
Subject: [PATCH 39/81] Start Mdoc tests

---
 pandoc.cabal               |  1 +
 test/Tests/Readers/Mdoc.hs | 71 ++++++++++++++++++++++++++++++++++++++
 test/test-pandoc.hs        |  2 ++
 3 files changed, 74 insertions(+)
 create mode 100644 test/Tests/Readers/Mdoc.hs

diff --git a/pandoc.cabal b/pandoc.cabal
index 39508116c1fe..e36eb4a9ed1f 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -827,6 +827,7 @@ test-suite test-pandoc
                   Tests.Readers.Muse
                   Tests.Readers.Creole
                   Tests.Readers.Man
+                  Tests.Readers.Mdoc
                   Tests.Readers.FB2
                   Tests.Readers.DokuWiki
                   Tests.Writers.Native
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
new file mode 100644
index 000000000000..24a535f4ae5f
--- /dev/null
+++ b/test/Tests/Readers/Mdoc.hs
@@ -0,0 +1,71 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Tests.Readers.Mdoc
+   Copyright   : © 2024 Evan Silberman
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : 
+   Stability   : alpha
+   Portability : portable
+
+Tests for the Mdoc reader.
+-}
+
+module Tests.Readers.Mdoc (tests) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import Test.Tasty
+import Test.Tasty.HUnit (HasCallStack)
+import Tests.Helpers
+import Text.Pandoc
+import Text.Pandoc.Arbitrary ()
+import Text.Pandoc.Builder
+
+mdoc :: Text -> Pandoc
+mdoc = purely $ readMdoc def
+
+infix 4 =:
+(=:) :: (ToString c, HasCallStack)
+     => String -> (Text, c) -> TestTree
+(=:) = test mdoc
+
+tests :: [TestTree]
+tests = [
+  testGroup "one-line enclosures"
+    [ "Dq" =:
+        ".Dq hello world" =?>
+        para (doubleQuoted "hello world")
+    , "Sq" =:
+        ".Sq hello world" =?>
+        para (singleQuoted "hello world")
+    ]
+  , testGroup "inlines"
+    [ "Sy" =:
+        ".Sy hello world" =?>
+        para (strong "hello world")
+    , "delimiters" =:
+        ".Sy ( hello world )" =?>
+        para (mconcat ["(", strong "hello world", ")"])
+    , "multiple" =:
+        ".Sy hello Em world" =?>
+        para (strong "hello" <> space <> emph "world")
+    ]
+  , testGroup "Ns macro"
+    [ "at the beginning of a macro line (mandoc delta)" =:
+        T.unlines [".Op before", ".Ns Op after"] =?>
+        para "[before][after]"
+    , "after a block closing macro" =:
+        T.unlines [".Oo before", ".Oc Ns Op after"] =?>
+        para "[before][after]"
+    , "in the middle of a macro line" =:
+        ".Oo before Oc Ns Op after" =?>
+        para "[before][after]"
+    , "at the end of a macro line" =:
+        T.unlines [".Oo before Oc Ns", ".Op after"] =?>
+        para "[before][after]"
+    , "at the end of a partial-implicit line" =:
+        T.unlines [".Op before Ns", ".Op after"] =?>
+        para "[before][after]"
+    ]
+  ]
diff --git a/test/test-pandoc.hs b/test/test-pandoc.hs
index d8c78276b3ec..6d749b84549f 100644
--- a/test/test-pandoc.hs
+++ b/test/test-pandoc.hs
@@ -29,6 +29,7 @@ import qualified Tests.Readers.RST
 import qualified Tests.Readers.RTF
 import qualified Tests.Readers.Txt2Tags
 import qualified Tests.Readers.Man
+import qualified Tests.Readers.Mdoc
 import qualified Tests.Shared
 import qualified Tests.Writers.AsciiDoc
 import qualified Tests.Writers.ConTeXt
@@ -97,6 +98,7 @@ tests pandocPath = testGroup "pandoc tests"
           , testGroup "Muse" Tests.Readers.Muse.tests
           , testGroup "Creole" Tests.Readers.Creole.tests
           , testGroup "Man" Tests.Readers.Man.tests
+          , testGroup "Mdoc" Tests.Readers.Mdoc.tests
           , testGroup "FB2" Tests.Readers.FB2.tests
           , testGroup "DokuWiki" Tests.Readers.DokuWiki.tests
           ]

From 12f4c49ae77032949b46f28cccc09fef79080804 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 21:10:20 -0700
Subject: [PATCH 40/81] Use lineEnclosure to parse Sh/Ss

---
 src/Text/Pandoc/Readers/Mdoc.hs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 8977e3d6b3f3..e1cf9970aead 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -185,8 +185,9 @@ shToSectionMode _ = ShOther
 
 parseHeader :: PandocMonad m => MdocParser m Blocks
 parseHeader = do
-  (Macro m _) <- macro "Sh" <|> macro "Ss"
-  txt <- argsToInlines
+  (Macro m _) <- lookAhead $ macro "Sh" <|> macro "Ss"
+  txt <- lineEnclosure m id
+  eol
   let lvl = if m == "Sh" then 1 else 2
   when (lvl == 1) $ modifyState $ \s -> s{currentSection = (shToSectionMode . stringify) txt}
   return $ B.header lvl txt

From 3691891c46a4322f13c76889158a2699a0abdd84 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 21:10:31 -0700
Subject: [PATCH 41/81] Add Oo/Op macro

---
 src/Text/Pandoc/Readers/Mdoc.hs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index e1cf9970aead..ea6639ed9627 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -349,6 +349,9 @@ parseBq = lineEnclosure "Bq" $ \x -> "[" <> x <> "]"
 parseOp :: PandocMonad m => MdocParser m Inlines
 parseOp = lineEnclosure "Op" $ \x -> "[" <> x <> "]"
 
+parseOo :: PandocMonad m => MdocParser m Inlines
+parseOo =  multilineEnclosure "Oo" "Oc" $ \x -> "[" <> x <> "]"
+
 parseBrq :: PandocMonad m => MdocParser m Inlines
 parseBrq = lineEnclosure "Brq" $ \x -> "{" <> x <> "}"
 
@@ -405,6 +408,7 @@ parseInlineMacro =
       parseBrq,
       parseAq,
       parseDo,
+      parseOo,
       parseNs
     ]
 

From f8d263fe910a63b20070aa60d6c86bbacd1f8f34 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 21:14:08 -0700
Subject: [PATCH 42/81] Handle Ns at end of control line correctly

---
 src/Text/Pandoc/Readers/Mdoc.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index ea6639ed9627..25d0618056b2 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -270,7 +270,7 @@ lineEnclosure nm xform = do
   (first, further, finally) <- delimitedArgs
     (manyTill
       (parseInlineMacro <|> (try (litsAndDelimsToText <* notFollowedBy eol))
-        <|> litsToText) (lookAhead (many (delim Close) *> eol)))
+        <|> litsToText) (lookAhead (many (macro "Ns" <|> delim Close) *> eol)))
   return $ first <> xform (spacify further) <> finally
 
 noSpace :: Inlines

From e6d6dd97cc72e00777f81c8e4401b576d72fdcf3 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 21:39:49 -0700
Subject: [PATCH 43/81] Add the No macro

---
 src/Text/Pandoc/Readers/Mdoc.hs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 25d0618056b2..6803fe6f2b91 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -322,6 +322,9 @@ parseSy = simpleInline "Sy" (eliminateEmpty B.strong)
 parseEm :: PandocMonad m => MdocParser m Inlines
 parseEm = simpleInline "Em" (eliminateEmpty B.emph)
 
+parseNo :: PandocMonad m => MdocParser m Inlines
+parseNo = simpleInline "No" (eliminateEmpty id)
+
 parseQl :: PandocMonad m => MdocParser m Inlines
 parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify
 
@@ -396,6 +399,7 @@ parseInlineMacro =
   choice
     [ parseSy,
       parseEm,
+      parseNo,
       parseNm,
       parseXr,
       parseQl,

From d319de0c5b1ce9a17fa6ada0f365e0793b01b9e8 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 21:40:21 -0700
Subject: [PATCH 44/81] Lex quoted callable macros as macros

---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index fe50f64f02ac..b49a6e7845ca 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -146,22 +146,30 @@ quoteChar = T.singleton <$> char '"'
 mdocToken :: PandocMonad m => Lexer m MdocTokens
 mdocToken = lexComment <|> lexControlLine <|> lexTextLine
 
+lexMacroName :: PandocMonad m => Lexer m T.Text
+lexMacroName = many1Char (satisfy isMacroChar)
+  where
+    isMacroChar '%' = True
+    isMacroChar x = isAlphaNum x
+
 lexMacro :: PandocMonad m => Lexer m MdocToken
 lexMacro = do
   pos <- getPosition
-  name <- many1Char (satisfy isMacroChar)
+  name <- lexMacroName
   eof <|> void (lookAhead (spaceChar <|> newline))
   skipSpaces
   return $ Macro name pos
-  where
-    isMacroChar '%' = True
-    isMacroChar x = isAlphaNum x
 
 lexCallableMacro :: PandocMonad m => Lexer m MdocToken
 lexCallableMacro = do
-  m@(Macro name _) <- lexMacro
+  pos <- getPosition
+  q <- optionMaybe quoteChar
+  name <- lexMacroName
+  when (isJust q) (void quoteChar)
+  eof <|> void (lookAhead (spaceChar <|> newline))
+  skipSpaces
   guard $ isCallableMacro name
-  return m
+  return $ Macro name pos
 
 lexDelim :: PandocMonad m => Lexer m MdocToken
 lexDelim = do

From 30e3fce90c50f35597c8e2b6013e8edb28443689 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 21:40:59 -0700
Subject: [PATCH 45/81] Convert some more Ns tests from mandoc

---
 test/Tests/Readers/Mdoc.hs | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index 24a535f4ae5f..b45a85518773 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -61,11 +61,26 @@ tests = [
     , "in the middle of a macro line" =:
         ".Oo before Oc Ns Op after" =?>
         para "[before][after]"
+    , "before closing punctuation" =:
+        ".Oo before Oc Ns : Op after" =?>
+        para "[before]: [after]"
+    , "after closing punctuation" =:
+        ".Oo before Oc : Ns Op after" =?>
+        para "[before]:[after]"
     , "at the end of a macro line" =:
         T.unlines [".Oo before Oc Ns", ".Op after"] =?>
         para "[before][after]"
     , "at the end of a partial-implicit line" =:
         T.unlines [".Op before Ns", ".Op after"] =?>
         para "[before][after]"
+    , "normal words" =:
+        ".No no Ns ns No no" =?>
+        para ("nons" <> space <> "no")
+    , "opening punctuation" =:
+        ".No no Ns \"(\" ns No no" =?>
+        para ("no(ns" <> space <> "no")
+    , "closing punctuation" =:
+        ".No no \"Ns\" ns \")\" No no" =?>
+        para ("nons)" <> space <> "no")
     ]
   ]

From 608bbb2e8c949ccd4bb709fb041bed6fe8ba7c33 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 22:15:47 -0700
Subject: [PATCH 46/81] Add more mandoc regress tests

---
 test/Tests/Readers/Mdoc.hs | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index b45a85518773..c2b11930fa55 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -39,6 +39,9 @@ tests = [
     , "Sq" =:
         ".Sq hello world" =?>
         para (singleQuoted "hello world")
+    , "empty" =:
+        ".Dq" =?>
+        para (doubleQuoted mempty)
     ]
   , testGroup "inlines"
     [ "Sy" =:
@@ -54,7 +57,7 @@ tests = [
   , testGroup "Ns macro"
     [ "at the beginning of a macro line (mandoc delta)" =:
         T.unlines [".Op before", ".Ns Op after"] =?>
-        para "[before][after]"
+        para "[before][after]"  -- mandoc: warning + "[before] [after]"
     , "after a block closing macro" =:
         T.unlines [".Oo before", ".Oc Ns Op after"] =?>
         para "[before][after]"
@@ -63,7 +66,7 @@ tests = [
         para "[before][after]"
     , "before closing punctuation" =:
         ".Oo before Oc Ns : Op after" =?>
-        para "[before]: [after]"
+        para "[before]: [after]"  -- mandoc: warning
     , "after closing punctuation" =:
         ".Oo before Oc : Ns Op after" =?>
         para "[before]:[after]"
@@ -83,4 +86,32 @@ tests = [
         ".No no \"Ns\" ns \")\" No no" =?>
         para ("nons)" <> space <> "no")
     ]
+  , testGroup "inline punctuation"
+    [ testGroup "leading punctuation"
+      [ "open paren"           =: ".Em ( b"     =?> para ("(" <> emph "b")
+      , "open square bracket"  =: ".Em \"[\" b" =?> para ("[" <> emph "b")
+      -- , "pipe"                 =: ".Em | b"     =?> para ("|" <> space <> emph "b")
+      , "period"               =: ".Em . b"     =?> para ("." <> space <> emph "b")
+      , "comma"                =: ".Em , b"     =?> para ("," <> space <> emph "b")
+      , "semicolon"            =: ".Em ; b"     =?> para (";" <> space <> emph "b")
+      , "colon"                =: ".Em : b"     =?> para (":" <> space <> emph "b")
+      , "question mark"        =: ".Em ? b"     =?> para ("?" <> space <> emph "b")
+      , "exclamation"          =: ".Em ! b"     =?> para ("!" <> space <> emph "b")
+      , "close paren"          =: ".Em ) b"     =?> para (")" <> space <> emph "b")
+      , "close square bracket" =: ".Em \"]\" b" =?> para ("]" <> space <> emph "b")
+      ]
+    , testGroup "trailing punctuation"
+      [ "open paren"           =: ".Em a ("     =?> para (emph "a" <> space <> "(")
+      , "open square bracket"  =: ".Em a ["     =?> para (emph "a" <> space <> "[")
+      -- , "pipe"  =: ".Em a |"     =?> para (emph "a" <> space <> "|")
+      , "period"               =: ".Em a ."     =?> para (emph "a" <> ".")
+      , "comma"                =: ".Em a ,"     =?> para (emph "a" <> ",")
+      , "semicolon"            =: ".Em a ;"     =?> para (emph "a" <> ";")
+      , "colon"                =: ".Em a :"     =?> para (emph "a" <> ":")
+      , "question mark"        =: ".Em a ?"     =?> para (emph "a" <> "?")
+      , "exclamation"          =: ".Em a !"     =?> para (emph "a" <> "!")
+      , "close parens"         =: ".Em a \")\"" =?> para (emph "a" <> ")")
+      , "close square bracket" =: ".Em a ]"     =?> para (emph "a" <> "]")
+      ]
+    ]
   ]

From bb3db9c72d75b75f54d702a44ca5be6d45338ca4 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 22:39:41 -0700
Subject: [PATCH 47/81] Add remaining multiline enclosures

---
 src/Text/Pandoc/Readers/Mdoc.hs | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 6803fe6f2b91..7990a168eaf5 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -337,15 +337,27 @@ parseDo =  multilineEnclosure "Do" "Dc" B.doubleQuoted
 parseSq :: PandocMonad m => MdocParser m Inlines
 parseSq = lineEnclosure "Sq" B.singleQuoted
 
+parseSo :: PandocMonad m => MdocParser m Inlines
+parseSo =  multilineEnclosure "So" "Sc" B.singleQuoted
+
 parseQq :: PandocMonad m => MdocParser m Inlines
 parseQq = lineEnclosure "Qq" $ \x -> "\"" <> x <> "\""
 
+parseQo :: PandocMonad m => MdocParser m Inlines
+parseQo =  multilineEnclosure "Qo" "Qc" $ \x -> "\"" <> x <> "\""
+
 parsePq :: PandocMonad m => MdocParser m Inlines
 parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")"
 
+parsePo :: PandocMonad m => MdocParser m Inlines
+parsePo = multilineEnclosure "Po" "Pc" $ \x -> "(" <> x <> ")"
+
 parseBq :: PandocMonad m => MdocParser m Inlines
 parseBq = lineEnclosure "Bq" $ \x -> "[" <> x <> "]"
 
+parseBo :: PandocMonad m => MdocParser m Inlines
+parseBo = multilineEnclosure "Bo" "Bc" $ \x -> "[" <> x <> "]"
+
 -- For our purposes this probably behaves identically to Bq
 -- in most circumstances but I might need to do something
 -- special with it in SYNOPSIS
@@ -358,9 +370,15 @@ parseOo =  multilineEnclosure "Oo" "Oc" $ \x -> "[" <> x <> "]"
 parseBrq :: PandocMonad m => MdocParser m Inlines
 parseBrq = lineEnclosure "Brq" $ \x -> "{" <> x <> "}"
 
+parseBro :: PandocMonad m => MdocParser m Inlines
+parseBro = multilineEnclosure "Bro" "Brc" $ \x -> "{" <> x <> "}"
+
 parseAq :: PandocMonad m => MdocParser m Inlines
 parseAq = lineEnclosure "Aq" $ \x -> "⟨" <> x <> "⟩"
 
+parseAo :: PandocMonad m => MdocParser m Inlines
+parseAo = multilineEnclosure "Ao" "Ac" $ \x -> "⟨" <> x <> "⟩"
+
 parseNm :: PandocMonad m => MdocParser m Inlines
 parseNm = do
   mnm <- (progName <$> getState)
@@ -411,7 +429,13 @@ parseInlineMacro =
       parseBq,
       parseBrq,
       parseAq,
+      parseSo,
       parseDo,
+      parseQo,
+      parsePo,
+      parseBo,
+      parseBro,
+      parseAo,
       parseOo,
       parseNs
     ]

From 07ea56e051c5debb16c6f9cb2dec487d9c41302f Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 23:07:01 -0700
Subject: [PATCH 48/81] Fix actual multiline enclosures

Consumption of eol on the line for the opening macro got lost. Add a
couple tests. Some irrelevant tests in this commit. Sorry!!
---
 src/Text/Pandoc/Readers/Mdoc.hs |  9 +++++++--
 test/Tests/Readers/Mdoc.hs      | 17 +++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 7990a168eaf5..9deef65d168a 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -304,8 +304,13 @@ but pandoc inlines inside of these multiline enclosures. -}
 multilineEnclosure :: PandocMonad m => T.Text -> T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
 multilineEnclosure op cl xform = do
   macro op
-  (first, further, finally) <- delimitedArgs (manyTill parseInlines (macro cl))
-  return $ first <> xform (spacify further) <> finally
+  openDelim <- mconcat <$> many (parseDelim Open)
+  optional eol
+  contents <- many parseInline
+  (macro cl <?> show cl)
+  closeDelim <- mconcat <$> many (parseDelim Close)
+  optional eol
+  return $ openDelim <> xform (spacify contents) <> closeDelim
 
 eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines
 eliminateEmpty x y = if null y then mempty else x y
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index c2b11930fa55..12b53c26e71d 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -42,11 +42,28 @@ tests = [
     , "empty" =:
         ".Dq" =?>
         para (doubleQuoted mempty)
+    , "nested" =:
+        ".Dq Pq hello world" =?>
+        para (doubleQuoted "(hello world)")
+    , "with inlines" =:
+        ".Dq hello Sy world ." =?>
+        para (doubleQuoted ("hello" <> space <> strong "world" <> "."))
+    ]
+  , testGroup "multiline enclosures"
+    [ "nested multiline" =:
+        T.unlines [".Bo", ".Po", "hi", ".Pc", ".Bc"] =?>
+        para ("[(hi)]")
+    , "nested on one line" =:
+        ".Bo Po hi Pc Bc" =?>
+        para ("[(hi)]")
     ]
   , testGroup "inlines"
     [ "Sy" =:
         ".Sy hello world" =?>
         para (strong "hello world")
+    , "Em" =:
+        ".Em hello world" =?>
+        para (emph "hello world")
     , "delimiters" =:
         ".Sy ( hello world )" =?>
         para (mconcat ["(", strong "hello world", ")"])

From ee6be32434be8143bcbcd06ac7b764c8bb187520 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 23:08:06 -0700
Subject: [PATCH 49/81] Remove YAGNI parseLit

I will probably actually need it later, but not now
---
 src/Text/Pandoc/Readers/Mdoc.hs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 9deef65d168a..b7e3f5eccc7e 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -219,11 +219,6 @@ parseStr = do
   (Str txt _) <- str
   return $ B.str txt
 
-parseLit :: PandocMonad m => MdocParser m Inlines
-parseLit = do
-  (Lit txt _) <- lit
-  return $ B.str txt
-
 parseDelim :: PandocMonad m => DelimSide -> MdocParser m Inlines
 parseDelim pos = do
   (Delim _ txt _) <- delim pos

From ebef1a2fc0eacfad3dac78acaa74e19646c9ece6 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Sun, 15 Sep 2024 23:24:16 -0700
Subject: [PATCH 50/81] fixup: remaining multiline enclosures

---
 src/Text/Pandoc/Readers/Mdoc.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index b7e3f5eccc7e..6eb6c30436d6 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -344,7 +344,7 @@ parseQq :: PandocMonad m => MdocParser m Inlines
 parseQq = lineEnclosure "Qq" $ \x -> "\"" <> x <> "\""
 
 parseQo :: PandocMonad m => MdocParser m Inlines
-parseQo =  multilineEnclosure "Qo" "Qc" $ \x -> "\"" <> x <> "\""
+parseQo = multilineEnclosure "Qo" "Qc" $ \x -> "\"" <> x <> "\""
 
 parsePq :: PandocMonad m => MdocParser m Inlines
 parsePq = lineEnclosure "Pq" $ \x -> "(" <> x <> ")"

From 8664d8e9314592ed3ee318babd80e34c82eba5ad Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Mon, 16 Sep 2024 10:14:18 -0700
Subject: [PATCH 51/81] Handle middle delimiters (i.e. the pipe)

---
 src/Text/Pandoc/Readers/Mdoc.hs     | 17 ++++++++++-------
 src/Text/Pandoc/Readers/Mdoc/Lex.hs |  7 +++++--
 test/Tests/Readers/Mdoc.hs          |  4 ++--
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 6eb6c30436d6..1b75f57d952b 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -232,20 +232,23 @@ litsToText = do
 
 litsAndDelimsToText :: PandocMonad m => MdocParser m Inlines
 litsAndDelimsToText = do
-  ods <- mconcat <$> many (parseDelim Open)
-  ls <- many lit
-  cds <- mconcat <$> if null ods && null ls
-                        then many1 (parseDelim Close)
-                        else many (parseDelim Close)
+  (o, ls, c) <- delimitedArgs $ many lit
+  guard $ not (null o && null ls && null c)
   let strs = map (B.str . toString) ls
-  return $ ods <> spacify strs <> cds
+  return $ o <> spacify strs <> c
 
 delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines)
 delimitedArgs p = do
     openDelim <- mconcat <$> many (parseDelim Open)
+    omids <- spacify <$> many (parseDelim Middle)
+    let omid | null omids = mempty
+             | otherwise = omids <> B.space
     inlines <- p
+    cmids <- spacify <$> many (parseDelim Middle)
+    let cmid | null cmids = mempty
+             | otherwise = B.space <> cmids
     closeDelim <- mconcat <$> many (parseDelim Close)
-    return (openDelim, inlines, closeDelim)
+    return (openDelim <> omid, inlines, cmid <> closeDelim)
 
 -- TODO extract further?
 simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index b49a6e7845ca..3cf741101a02 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -171,11 +171,14 @@ lexCallableMacro = do
   guard $ isCallableMacro name
   return $ Macro name pos
 
-lexDelim :: PandocMonad m => Lexer m MdocToken
+lexDelim :: (PandocMonad m) => Lexer m MdocToken
 lexDelim = do
   pos <- getPosition
   q <- optionMaybe quoteChar
-  t <- Delim Open <$> oneOfStrings ["(", "["] <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"]
+  t <-
+    Delim Open <$> oneOfStrings ["(", "["]
+      <|> Delim Close <$> oneOfStrings [".", ",", ":", ";", ")", "]", "?", "!"]
+      <|> Delim Middle <$> textStr "|"
   when (isJust q) (void quoteChar)
   eof <|> void (lookAhead (spaceChar <|> newline))
   skipSpaces
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index 12b53c26e71d..ba671d871a9b 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -107,7 +107,7 @@ tests = [
     [ testGroup "leading punctuation"
       [ "open paren"           =: ".Em ( b"     =?> para ("(" <> emph "b")
       , "open square bracket"  =: ".Em \"[\" b" =?> para ("[" <> emph "b")
-      -- , "pipe"                 =: ".Em | b"     =?> para ("|" <> space <> emph "b")
+      , "pipe"                 =: ".Em | b"     =?> para ("|" <> space <> emph "b")
       , "period"               =: ".Em . b"     =?> para ("." <> space <> emph "b")
       , "comma"                =: ".Em , b"     =?> para ("," <> space <> emph "b")
       , "semicolon"            =: ".Em ; b"     =?> para (";" <> space <> emph "b")
@@ -120,7 +120,7 @@ tests = [
     , testGroup "trailing punctuation"
       [ "open paren"           =: ".Em a ("     =?> para (emph "a" <> space <> "(")
       , "open square bracket"  =: ".Em a ["     =?> para (emph "a" <> space <> "[")
-      -- , "pipe"  =: ".Em a |"     =?> para (emph "a" <> space <> "|")
+      , "pipe"                 =: ".Em a |"     =?> para (emph "a" <> space <> "|")
       , "period"               =: ".Em a ."     =?> para (emph "a" <> ".")
       , "comma"                =: ".Em a ,"     =?> para (emph "a" <> ",")
       , "semicolon"            =: ".Em a ;"     =?> para (emph "a" <> ";")

From 35cd77beb251ac0e2a7961ded4415d0fcdef9de8 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Mon, 16 Sep 2024 10:26:30 -0700
Subject: [PATCH 52/81] Add middle punctuation tests from mandoc regress

---
 test/Tests/Readers/Mdoc.hs | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index ba671d871a9b..be154635aa54 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -130,5 +130,18 @@ tests = [
       , "close parens"         =: ".Em a \")\"" =?> para (emph "a" <> ")")
       , "close square bracket" =: ".Em a ]"     =?> para (emph "a" <> "]")
       ]
+    , testGroup "middle punctuation"
+      [ "open paren"           =: ".Em a ( b"     =?> para (mconcat [emph "a", space, "(", emph "b"])
+      , "open square bracket"  =: ".Em a [ b"     =?> para (mconcat [emph "a", space, "[", emph "b"])
+      , "pipe"                 =: ".Em a \"|\" b" =?> para (mconcat [emph "a", space, "|", space, emph "b"])
+      , "period"               =: ".Em a . b"     =?> para (mconcat [emph "a", ".", space, emph "b"])
+      , "comma"                =: ".Em a , b"     =?> para (mconcat [emph "a", ",", space, emph "b"])
+      , "semicolon"            =: ".Em a ; b"     =?> para (mconcat [emph "a", ";", space, emph "b"])
+      , "colon"                =: ".Em a \":\" b" =?> para (mconcat [emph "a", ":", space, emph "b"])
+      , "question mark"        =: ".Em a ? b"     =?> para (mconcat [emph "a", "?", space, emph "b"])
+      , "exclamation"          =: ".Em a ! b"     =?> para (mconcat [emph "a", "!", space, emph "b"])
+      , "close paren"          =: ".Em a ) b"     =?> para (mconcat [emph "a", ")", space, emph "b"])
+      , "close square bracket" =: ".Em a ] b"     =?> para (mconcat [emph "a", "]", space, emph "b"])
+      ]
     ]
   ]

From f3b8fee8acf65825af04a438b7224550e5645f85 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Mon, 16 Sep 2024 12:52:45 -0700
Subject: [PATCH 53/81] fixup: Spacing enclosures

---
 src/Text/Pandoc/Readers/Mdoc.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 1b75f57d952b..f14d88937c2d 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -335,13 +335,13 @@ parseDq :: PandocMonad m => MdocParser m Inlines
 parseDq = lineEnclosure "Dq" B.doubleQuoted
 
 parseDo :: PandocMonad m => MdocParser m Inlines
-parseDo =  multilineEnclosure "Do" "Dc" B.doubleQuoted
+parseDo = multilineEnclosure "Do" "Dc" B.doubleQuoted
 
 parseSq :: PandocMonad m => MdocParser m Inlines
 parseSq = lineEnclosure "Sq" B.singleQuoted
 
 parseSo :: PandocMonad m => MdocParser m Inlines
-parseSo =  multilineEnclosure "So" "Sc" B.singleQuoted
+parseSo = multilineEnclosure "So" "Sc" B.singleQuoted
 
 parseQq :: PandocMonad m => MdocParser m Inlines
 parseQq = lineEnclosure "Qq" $ \x -> "\"" <> x <> "\""

From 81504eaeb92e68600bab4abd46092c7c545b6734 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Mon, 16 Sep 2024 13:01:59 -0700
Subject: [PATCH 54/81] Add Lk support for hyperlinks

Refactor delimiter parsing to support special needs. Could end up
unified again at some point if I think harder.
---
 src/Text/Pandoc/Readers/Mdoc.hs | 72 ++++++++++++++++++++++++++-------
 test/Tests/Readers/Mdoc.hs      | 13 +++++-
 2 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index f14d88937c2d..e3107b31ac31 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -149,15 +149,18 @@ eol = void $ msatisfy t where
   t Eol{} = True
   t _ = False
 
+newControlContext :: MdocToken -> Bool
+newControlContext Eol{} = True
+newControlContext Macro{} = True
+newControlContext Str{} = True
+newControlContext Tbl{} = True
+newControlContext Blank{} = True
+newControlContext Lit{} = False
+newControlContext Delim{} = False
+
+
 inlineContextEnd :: PandocMonad m => MdocParser m ()
-inlineContextEnd = eof <|> (void . lookAhead $ msatisfy t) where
-  t Eol{} = True
-  t Macro{} = True
-  t Str{} = True -- shouldn't be lexed
-  t Tbl{} = True -- shouldn't be lexed
-  t Blank{} = True -- shouldn't be lexed
-  t Lit{} = False
-  t Delim{} = False
+inlineContextEnd = eof <|> (void . lookAhead $ msatisfy newControlContext)
 
 argsToInlines :: PandocMonad m => MdocParser m Inlines
 argsToInlines = do
@@ -237,18 +240,31 @@ litsAndDelimsToText = do
   let strs = map (B.str . toString) ls
   return $ o <> spacify strs <> c
 
-delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines)
-delimitedArgs p = do
+openingDelimiters :: PandocMonad m => MdocParser m Inlines
+openingDelimiters = do
     openDelim <- mconcat <$> many (parseDelim Open)
-    omids <- spacify <$> many (parseDelim Middle)
+    omids <- pipes
     let omid | null omids = mempty
              | otherwise = omids <> B.space
-    inlines <- p
-    cmids <- spacify <$> many (parseDelim Middle)
+    return $ openDelim <> omid
+
+pipes :: PandocMonad m => MdocParser m Inlines
+pipes = spacify <$> many (parseDelim Middle)
+
+closingDelimiters :: PandocMonad m => MdocParser m Inlines
+closingDelimiters = do
+    cmids <- pipes
     let cmid | null cmids = mempty
              | otherwise = B.space <> cmids
     closeDelim <- mconcat <$> many (parseDelim Close)
-    return (openDelim <> omid, inlines, cmid <> closeDelim)
+    return $ cmid <> closeDelim
+
+delimitedArgs :: PandocMonad m => MdocParser m x -> MdocParser m (Inlines, x, Inlines)
+delimitedArgs p = do
+    openDelim <- openingDelimiters
+    inlines <- p
+    closeDelim <- closingDelimiters
+    return (openDelim, inlines, closeDelim)
 
 -- TODO extract further?
 simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
@@ -409,6 +425,33 @@ parseXr = do
         s <- lit <?> "Xr manual section"
         return (toString n, toString s)
 
+parseLk :: PandocMonad m => MdocParser m Inlines
+parseLk = do
+  macro "Lk"
+  openClose <- closingDelimiters
+  openOpen <- openingDelimiters
+  url <- toString <$> lit
+  inner <- spacify <$> many segment
+  close <- closingDelimiters
+  let label | null inner = B.str url
+            | otherwise = inner
+  return $ open openClose openOpen <> B.link url "" label <> close
+  where
+    open a b
+      | null a = b
+      | null b = a
+      | otherwise = a <> B.space <> b
+    end = msatisfy newControlContext
+    segment = do
+      a <- openingDelimiters
+      m <- option mempty litsToText
+      z <-
+        try (closingDelimiters <* notFollowedBy end)
+          <|> option mempty pipes
+      guard $ not $ all null [a, m, z]
+      return $ a <> m <> z
+
+
 parseNs :: PandocMonad m => MdocParser m Inlines
 parseNs = macro "Ns" >> return noSpace
 
@@ -420,6 +463,7 @@ parseInlineMacro =
   choice
     [ parseSy,
       parseEm,
+      parseLk,
       parseNo,
       parseNm,
       parseXr,
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index be154635aa54..85d47cce45c9 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -57,7 +57,7 @@ tests = [
         ".Bo Po hi Pc Bc" =?>
         para ("[(hi)]")
     ]
-  , testGroup "inlines"
+  , testGroup "simple inlines"
     [ "Sy" =:
         ".Sy hello world" =?>
         para (strong "hello world")
@@ -71,6 +71,17 @@ tests = [
         ".Sy hello Em world" =?>
         para (strong "hello" <> space <> emph "world")
     ]
+  , testGroup "links"
+    [ "basic" =:
+        ".Lk href name" =?>
+        para (link "href" "" "name")
+    , "complicated" =:
+        ".Lk , ( href name )" =?>
+        para ("," <> space <> "(" <> link "href" "" "name" <> ")")
+    , "unnamed" =:
+        ".Lk href" =?>
+        para (link "href" "" "href")
+    ]
   , testGroup "Ns macro"
     [ "at the beginning of a macro line (mandoc delta)" =:
         T.unlines [".Op before", ".Ns Op after"] =?>

From ecbb1dbab3dce6a6c64c8f0339a8513541f0ad10 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Mon, 16 Sep 2024 21:36:14 -0700
Subject: [PATCH 55/81] Add Mt macro

---
 src/Text/Pandoc/Readers/Mdoc.hs | 9 +++++++++
 test/Tests/Readers/Mdoc.hs      | 4 ++++
 2 files changed, 13 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index e3107b31ac31..82b759ec2645 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -344,6 +344,14 @@ parseEm = simpleInline "Em" (eliminateEmpty B.emph)
 parseNo :: PandocMonad m => MdocParser m Inlines
 parseNo = simpleInline "No" (eliminateEmpty id)
 
+-- I'm not sure why mandoc inserts a ~ when Mt is missing an argument,
+-- but it does, and it doesn't issue a warning, so that quirk is
+-- retained.
+parseMt :: PandocMonad m => MdocParser m Inlines
+parseMt = simpleInline "Mt" mailto
+  where mailto x | null x = B.link ("mailto:~") "" "~"
+                 | otherwise = B.link ("mailto:" <> stringify x) "" x
+
 parseQl :: PandocMonad m => MdocParser m Inlines
 parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify
 
@@ -464,6 +472,7 @@ parseInlineMacro =
     [ parseSy,
       parseEm,
       parseLk,
+      parseMt,
       parseNo,
       parseNm,
       parseXr,
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index 85d47cce45c9..b9f0454096fc 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -64,6 +64,10 @@ tests = [
     , "Em" =:
         ".Em hello world" =?>
         para (emph "hello world")
+    , "Mt" =:
+        ".Mt a@example.org , b@example.org" =?>
+        para ((link "mailto:a@example.org" "" "a@example.org") <>
+               "," <> space <> (link "mailto:b@example.org" "" "b@example.org"))
     , "delimiters" =:
         ".Sy ( hello world )" =?>
         para (mconcat ["(", strong "hello world", ")"])

From 309e864a23784a67c9b9e08a9803e6ddb1a5bb58 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Mon, 16 Sep 2024 21:37:05 -0700
Subject: [PATCH 56/81] Add .No test

---
 test/Tests/Readers/Mdoc.hs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index b9f0454096fc..f2e6a152b232 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -68,6 +68,9 @@ tests = [
         ".Mt a@example.org , b@example.org" =?>
         para ((link "mailto:a@example.org" "" "a@example.org") <>
                "," <> space <> (link "mailto:b@example.org" "" "b@example.org"))
+    , "No" =:
+        ".No ( hello , world ! )" =?>
+        para "(hello, world!)"
     , "delimiters" =:
         ".Sy ( hello world )" =?>
         para (mconcat ["(", strong "hello world", ")"])

From 2f287508675ac2b9f50675885efcba9039f3bc95 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 18 Sep 2024 18:03:04 -0700
Subject: [PATCH 57/81] Add Pa (path) macro

---
 src/Text/Pandoc/Readers/Mdoc.hs | 6 ++++++
 test/Tests/Readers/Mdoc.hs      | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 82b759ec2645..a1068015a075 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -352,6 +352,11 @@ parseMt = simpleInline "Mt" mailto
   where mailto x | null x = B.link ("mailto:~") "" "~"
                  | otherwise = B.link ("mailto:" <> stringify x) "" x
 
+parsePa :: PandocMonad m => MdocParser m Inlines
+parsePa = simpleInline "Pa" p
+  where p x | null x = B.spanWith (cls "Pa") "~"
+            | otherwise = B.spanWith (cls "Pa") x
+
 parseQl :: PandocMonad m => MdocParser m Inlines
 parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify
 
@@ -473,6 +478,7 @@ parseInlineMacro =
       parseEm,
       parseLk,
       parseMt,
+      parsePa,
       parseNo,
       parseNm,
       parseXr,
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index f2e6a152b232..9f79d3f2e123 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -71,6 +71,9 @@ tests = [
     , "No" =:
         ".No ( hello , world ! )" =?>
         para "(hello, world!)"
+    , "empty Pa with closing punctuation" =:
+        ".Pa ) z" =?>
+        para (spanWith (mempty, ["Pa"], mempty) "~" <> ")" <> space <> spanWith (mempty, ["Pa"], mempty) "z")
     , "delimiters" =:
         ".Sy ( hello world )" =?>
         para (mconcat ["(", strong "hello world", ")"])

From f921ba82dd44bbcf5eada60fa58790cdca98370c Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 19 Sep 2024 16:13:16 -0700
Subject: [PATCH 58/81] Add Fl, Ev, Cm and tests

---
 src/Text/Pandoc/Readers/Mdoc.hs | 46 +++++++++++++++++++++++++++------
 test/Tests/Readers/Mdoc.hs      | 25 +++++++++++++++++-
 2 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index a1068015a075..be0092c34e9a 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -227,14 +227,19 @@ parseDelim pos = do
   (Delim _ txt _) <- delim pos
   return $ B.str txt
 
-litsToText :: PandocMonad m => MdocParser m Inlines
+litsToText :: PandocMonad m => MdocParser m [T.Text]
 litsToText = do
+  ls <- many1 lit
+  return $ map toString ls
+
+litsToInlines :: PandocMonad m => MdocParser m Inlines
+litsToInlines = do
   ls <- many1 lit
   let strs = map (B.str . toString) ls
   return $ spacify strs
 
-litsAndDelimsToText :: PandocMonad m => MdocParser m Inlines
-litsAndDelimsToText = do
+litsAndDelimsToInlines :: PandocMonad m => MdocParser m Inlines
+litsAndDelimsToInlines = do
   (o, ls, c) <- delimitedArgs $ many lit
   guard $ not (null o && null ls && null c)
   let strs = map (B.str . toString) ls
@@ -272,6 +277,16 @@ simpleInline nm xform = do
   macro nm
   segs <- manyTill segment inlineContextEnd
   return $ spacify segs
+ where
+   segment = do
+      (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToInlines
+      return $ openDelim <> xform inlines <> closeDelim
+
+argsInline :: PandocMonad m => T.Text -> ([T.Text] -> Inlines) -> MdocParser m Inlines
+argsInline nm xform = do
+  macro nm
+  segs <- manyTill segment inlineContextEnd
+  return $ spacify segs
  where
    segment = do
       (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText
@@ -283,8 +298,8 @@ lineEnclosure nm xform = do
   --- XXX wtf
   (first, further, finally) <- delimitedArgs
     (manyTill
-      (parseInlineMacro <|> (try (litsAndDelimsToText <* notFollowedBy eol))
-        <|> litsToText) (lookAhead (many (macro "Ns" <|> delim Close) *> eol)))
+      (parseInlineMacro <|> (try (litsAndDelimsToInlines <* notFollowedBy eol))
+        <|> litsToInlines) (lookAhead (many (macro "Ns" <|> delim Close) *> eol)))
   return $ first <> xform (spacify further) <> finally
 
 noSpace :: Inlines
@@ -344,6 +359,9 @@ parseEm = simpleInline "Em" (eliminateEmpty B.emph)
 parseNo :: PandocMonad m => MdocParser m Inlines
 parseNo = simpleInline "No" (eliminateEmpty id)
 
+parseEv :: PandocMonad m => MdocParser m Inlines
+parseEv = simpleInline "Ev" (eliminateEmpty (B.codeWith (cls "Ev") . stringify))
+
 -- I'm not sure why mandoc inserts a ~ when Mt is missing an argument,
 -- but it does, and it doesn't issue a warning, so that quirk is
 -- retained.
@@ -357,6 +375,14 @@ parsePa = simpleInline "Pa" p
   where p x | null x = B.spanWith (cls "Pa") "~"
             | otherwise = B.spanWith (cls "Pa") x
 
+parseFl :: PandocMonad m => MdocParser m Inlines
+parseFl = argsInline "Fl" (spacify . fl . flags)
+  where fl = map $ B.codeWith (cls "Fl")
+        flags = map ("-" <>)
+
+parseCm :: PandocMonad m => MdocParser m Inlines
+parseCm = simpleInline "Cm" $ B.codeWith (cls "Cm") . stringify
+
 parseQl :: PandocMonad m => MdocParser m Inlines
 parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify
 
@@ -416,7 +442,7 @@ parseNm = do
   mnm <- (progName <$> getState)
   case mnm of
     Nothing -> do
-      (_, nm, _) <- lookAhead $ delimitedArgs $ option mempty litsToText
+      (_, nm, _) <- lookAhead $ delimitedArgs $ option mempty litsToInlines
       guard $ not (null nm)
       simpleInline "Nm" ok
     Just nm -> simpleInline "Nm" $ \x ->
@@ -438,6 +464,7 @@ parseXr = do
         s <- lit <?> "Xr manual section"
         return (toString n, toString s)
 
+-- TODO: can I rewrite this with argsInline?
 parseLk :: PandocMonad m => MdocParser m Inlines
 parseLk = do
   macro "Lk"
@@ -457,7 +484,7 @@ parseLk = do
     end = msatisfy newControlContext
     segment = do
       a <- openingDelimiters
-      m <- option mempty litsToText
+      m <- option mempty litsToInlines
       z <-
         try (closingDelimiters <* notFollowedBy end)
           <|> option mempty pipes
@@ -477,8 +504,11 @@ parseInlineMacro =
     [ parseSy,
       parseEm,
       parseLk,
+      parseEv,
       parseMt,
       parsePa,
+      parseFl,
+      parseCm,
       parseNo,
       parseNm,
       parseXr,
@@ -504,7 +534,7 @@ parseInlineMacro =
 
 parseInline :: PandocMonad m => MdocParser m Inlines
 parseInline = parseStr  <|>
-  ((parseInlineMacro <|> litsAndDelimsToText) <* optional eol)
+  ((parseInlineMacro <|> litsAndDelimsToInlines) <* optional eol)
 
 
 -- TODO probably need some kind of fold to deal with Ns
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index 9f79d3f2e123..537952bbff4c 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -30,6 +30,9 @@ infix 4 =:
      => String -> (Text, c) -> TestTree
 (=:) = test mdoc
 
+cls :: Text -> Attr
+cls x = (mempty, [x], mempty)
+
 tests :: [TestTree]
 tests = [
   testGroup "one-line enclosures"
@@ -39,6 +42,9 @@ tests = [
     , "Sq" =:
         ".Sq hello world" =?>
         para (singleQuoted "hello world")
+    , "Ev" =:
+        ".Ev HELLO_WORLD ," =?>
+        para (codeWith (cls "Ev") "HELLO_WORLD" <> ",")
     , "empty" =:
         ".Dq" =?>
         para (doubleQuoted mempty)
@@ -73,7 +79,7 @@ tests = [
         para "(hello, world!)"
     , "empty Pa with closing punctuation" =:
         ".Pa ) z" =?>
-        para (spanWith (mempty, ["Pa"], mempty) "~" <> ")" <> space <> spanWith (mempty, ["Pa"], mempty) "z")
+        para (spanWith (cls "Pa") "~" <> ")" <> space <> spanWith (mempty, ["Pa"], mempty) "z")
     , "delimiters" =:
         ".Sy ( hello world )" =?>
         para (mconcat ["(", strong "hello world", ")"])
@@ -81,6 +87,23 @@ tests = [
         ".Sy hello Em world" =?>
         para (strong "hello" <> space <> emph "world")
     ]
+  , testGroup "Fl"
+    [ "simple" =:
+        ".Fl w" =?>
+        para (codeWith (cls "Fl") "-w")
+    , "multiple" =:
+        ".Fl W all" =?>
+        para (codeWith (cls "Fl") "-W" <> space <> codeWith (cls "Fl") "-all")
+    , "GNU" =:
+        ".Fl -help" =?>
+        para (codeWith (cls "Fl") "--help")
+    , "GNU escaped" =:
+        ".Fl \\-help" =?>
+        para (codeWith (cls "Fl") "--help")
+    , "punctuation" =:
+        ".Op Fl a | b" =?>
+        para ("[" <> codeWith (cls "Fl") "-a" <> " | " <> codeWith (cls "Fl") "-b" <> "]")
+    ]
   , testGroup "links"
     [ "basic" =:
         ".Lk href name" =?>

From 79c34c948fe8e76a5ec160c1f55139a19fd3f262 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 19 Sep 2024 16:17:17 -0700
Subject: [PATCH 59/81] fixup Ev: Move test to the right section

---
 test/Tests/Readers/Mdoc.hs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index 537952bbff4c..1c7f6a9dde39 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -42,9 +42,6 @@ tests = [
     , "Sq" =:
         ".Sq hello world" =?>
         para (singleQuoted "hello world")
-    , "Ev" =:
-        ".Ev HELLO_WORLD ," =?>
-        para (codeWith (cls "Ev") "HELLO_WORLD" <> ",")
     , "empty" =:
         ".Dq" =?>
         para (doubleQuoted mempty)
@@ -70,6 +67,9 @@ tests = [
     , "Em" =:
         ".Em hello world" =?>
         para (emph "hello world")
+    , "Ev" =:
+        ".Ev HELLO_WORLD ," =?>
+        para (codeWith (cls "Ev") "HELLO_WORLD" <> ",")
     , "Mt" =:
         ".Mt a@example.org , b@example.org" =?>
         para ((link "mailto:a@example.org" "" "a@example.org") <>

From 86af56880ca4274467095d26662dc52830f6f2ef Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Fri, 20 Sep 2024 12:02:42 -0700
Subject: [PATCH 60/81] Refine Fl parser

There's a number of unique-looking cases for Fl parsing so I am just
handling them very explicitly instead of trying to generalize anything
enough to handle it.
---
 src/Text/Pandoc/Readers/Mdoc.hs | 40 +++++++++++++++++++++++----------
 test/Tests/Readers/Mdoc.hs      | 18 +++++++++++++++
 2 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index be0092c34e9a..9d89785f1b22 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -14,6 +14,7 @@ Conversion of mdoc to 'Pandoc' document.
 module Text.Pandoc.Readers.Mdoc (readMdoc) where
 
 import Data.Default (Default)
+import Data.Functor (($>))
 import Control.Monad (mplus, guard, void, when)
 import Control.Monad.Except (throwError)
 import Data.List (intersperse)
@@ -110,6 +111,11 @@ macro name = msatisfy t where
   t (Macro n _) = n == name
   t _ = False
 
+anyMacro :: PandocMonad m => MdocParser m MdocToken
+anyMacro = msatisfy t where
+  t (Macro _ _) = True
+  t _ = False
+
 emptyMacro :: PandocMonad m => T.Text -> MdocParser m MdocToken
 emptyMacro n = macro n <* eol
 
@@ -282,15 +288,6 @@ simpleInline nm xform = do
       (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToInlines
       return $ openDelim <> xform inlines <> closeDelim
 
-argsInline :: PandocMonad m => T.Text -> ([T.Text] -> Inlines) -> MdocParser m Inlines
-argsInline nm xform = do
-  macro nm
-  segs <- manyTill segment inlineContextEnd
-  return $ spacify segs
- where
-   segment = do
-      (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText
-      return $ openDelim <> xform inlines <> closeDelim
 
 lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
 lineEnclosure nm xform = do
@@ -376,9 +373,28 @@ parsePa = simpleInline "Pa" p
             | otherwise = B.spanWith (cls "Pa") x
 
 parseFl :: PandocMonad m => MdocParser m Inlines
-parseFl = argsInline "Fl" (spacify . fl . flags)
-  where fl = map $ B.codeWith (cls "Fl")
-        flags = map ("-" <>)
+parseFl = do
+  macro "Fl"
+  start <- option mempty (emptyWithDelim <|> emptyWithMacro <|> emptyEmpty)
+  segs <- manyTill segment inlineContextEnd
+  return $ spacify ([start] <> segs)
+ where
+   emptyWithDelim = do
+     lookAhead $ many1 (delim Middle <|> delim Close)
+     ds <- closingDelimiters
+     return $ fl "-" <> ds
+   emptyWithMacro = do
+     lookAhead anyMacro
+     rest <- parseInlines
+     return $ fl "-" <> rest
+   emptyEmpty = lookAhead eol $> fl "-"
+   segment = do
+      (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText
+      return $ openDelim <> (spacify . (map fl) . flags) inlines <> closeDelim
+   fl = B.codeWith (cls "Fl")
+   flags [] = ["-"]
+   flags xs = map ("-" <>) xs
+
 
 parseCm :: PandocMonad m => MdocParser m Inlines
 parseCm = simpleInline "Cm" $ B.codeWith (cls "Cm") . stringify
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index 1c7f6a9dde39..a35f2fd9ca13 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -94,6 +94,12 @@ tests = [
     , "multiple" =:
         ".Fl W all" =?>
         para (codeWith (cls "Fl") "-W" <> space <> codeWith (cls "Fl") "-all")
+    , "empty with following macro" =:
+        ".Fl Cm x" =?>
+        para (codeWith (cls "Fl") "-" <> codeWith (cls "Cm") "x")
+    , "following Ns" =:
+        ".Fl W Ns Cm all" =?>
+        para (codeWith (cls "Fl") "-W" <> codeWith (cls "Cm") "all")
     , "GNU" =:
         ".Fl -help" =?>
         para (codeWith (cls "Fl") "--help")
@@ -103,6 +109,18 @@ tests = [
     , "punctuation" =:
         ".Op Fl a | b" =?>
         para ("[" <> codeWith (cls "Fl") "-a" <> " | " <> codeWith (cls "Fl") "-b" <> "]")
+    , "middle close paren" =:
+        ".Fl a ) z" =?>
+        para (codeWith (cls "Fl") "-a" <> ") " <> codeWith (cls "Fl") "-z")
+    , "empty with close paren" =:
+        ".Fl ) z" =?>
+        para (codeWith (cls "Fl") "-" <> ") " <> codeWith (cls "Fl") "-z")
+    , "empty with pipe" =:
+        ".Fl | z" =?>
+        para (codeWith (cls "Fl") "-" <> " | " <> codeWith (cls "Fl") "-z")
+    , "empty with parens" =:
+        ".Fl ( )" =?>
+        para ("(" <> codeWith (cls "Fl") "-" <> ")")
     ]
   , testGroup "links"
     [ "basic" =:

From 5a2196e1c17fa1e242b9c9d133754ed3bdf01941 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Fri, 20 Sep 2024 14:39:20 -0700
Subject: [PATCH 61/81] Extract helper for inline macros that produce Code

---
 src/Text/Pandoc/Readers/Mdoc.hs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 9d89785f1b22..c32be9b05da2 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -288,6 +288,9 @@ simpleInline nm xform = do
       (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToInlines
       return $ openDelim <> xform inlines <> closeDelim
 
+codeLikeInline :: PandocMonad m => T.Text -> MdocParser m Inlines
+codeLikeInline nm = do
+  simpleInline nm (eliminateEmpty (B.codeWith (cls nm) . stringify))
 
 lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
 lineEnclosure nm xform = do
@@ -357,7 +360,7 @@ parseNo :: PandocMonad m => MdocParser m Inlines
 parseNo = simpleInline "No" (eliminateEmpty id)
 
 parseEv :: PandocMonad m => MdocParser m Inlines
-parseEv = simpleInline "Ev" (eliminateEmpty (B.codeWith (cls "Ev") . stringify))
+parseEv = codeLikeInline "Ev"
 
 -- I'm not sure why mandoc inserts a ~ when Mt is missing an argument,
 -- but it does, and it doesn't issue a warning, so that quirk is
@@ -397,7 +400,7 @@ parseFl = do
 
 
 parseCm :: PandocMonad m => MdocParser m Inlines
-parseCm = simpleInline "Cm" $ B.codeWith (cls "Cm") . stringify
+parseCm = codeLikeInline "Cm"
 
 parseQl :: PandocMonad m => MdocParser m Inlines
 parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify

From 162f48d77930baaf30a7be693f568f6ac0c93beb Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Fri, 20 Sep 2024 14:41:00 -0700
Subject: [PATCH 62/81] Zap useless comments

---
 src/Text/Pandoc/Readers/Mdoc.hs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index c32be9b05da2..987361252bec 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -347,7 +347,6 @@ eliminateEmpty x y = if null y then mempty else x y
 cls :: T.Text -> B.Attr
 cls x = (mempty, [x], mempty)
 
--- Sy: callable, parsed, >0 arguments
 -- mandoc -T html formats Sy with a <b> tag, since it's not really
 -- semantically <strong>, but Strong is our best option in Pandoc
 parseSy :: PandocMonad m => MdocParser m Inlines
@@ -470,7 +469,6 @@ parseNm = do
          else ok x
   where ok = B.codeWith (cls "Nm") . stringify
 
--- Xr
 parseXr :: PandocMonad m => MdocParser m Inlines
 parseXr = do
   macro "Xr"

From 2cf842d04624e4d7f0306dade7ad7ab62bbbb385 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Fri, 20 Sep 2024 14:45:36 -0700
Subject: [PATCH 63/81] Remove useless do

---
 src/Text/Pandoc/Readers/Mdoc.hs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 987361252bec..9b6447b08e8a 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -289,8 +289,7 @@ simpleInline nm xform = do
       return $ openDelim <> xform inlines <> closeDelim
 
 codeLikeInline :: PandocMonad m => T.Text -> MdocParser m Inlines
-codeLikeInline nm = do
-  simpleInline nm (eliminateEmpty (B.codeWith (cls nm) . stringify))
+codeLikeInline nm = simpleInline nm (eliminateEmpty (B.codeWith (cls nm) . stringify))
 
 lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
 lineEnclosure nm xform = do

From 91e17306292eb2a15605407d42c3a83a99ba44b8 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Fri, 20 Sep 2024 15:24:57 -0700
Subject: [PATCH 64/81] Add .Fl Fl support

---
 src/Text/Pandoc/Readers/Mdoc.hs | 7 ++++++-
 test/Tests/Readers/Mdoc.hs      | 3 +++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 9b6447b08e8a..bb08e410fdb1 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -376,7 +376,7 @@ parsePa = simpleInline "Pa" p
 parseFl :: PandocMonad m => MdocParser m Inlines
 parseFl = do
   macro "Fl"
-  start <- option mempty (emptyWithDelim <|> emptyWithMacro <|> emptyEmpty)
+  start <- option mempty (emptyWithDelim <|> flfl <|> emptyWithMacro <|> emptyEmpty)
   segs <- manyTill segment inlineContextEnd
   return $ spacify ([start] <> segs)
  where
@@ -384,6 +384,11 @@ parseFl = do
      lookAhead $ many1 (delim Middle <|> delim Close)
      ds <- closingDelimiters
      return $ fl "-" <> ds
+   flfl = do
+     lookAhead (macro "Fl")
+     x:xs <- B.toList <$> parseFl
+     let xx = B.codeWith (cls "Fl") $ "-" <> stringify x
+     return $ xx <> B.fromList xs
    emptyWithMacro = do
      lookAhead anyMacro
      rest <- parseInlines
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index a35f2fd9ca13..5d007e05dacd 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -106,6 +106,9 @@ tests = [
     , "GNU escaped" =:
         ".Fl \\-help" =?>
         para (codeWith (cls "Fl") "--help")
+    , "GNU Fl Fl" =:
+        ".Fl Fl help" =?>
+        para (codeWith (cls "Fl") "--help")
     , "punctuation" =:
         ".Op Fl a | b" =?>
         para ("[" <> codeWith (cls "Fl") "-a" <> " | " <> codeWith (cls "Fl") "-b" <> "]")

From 2a1688bb6c34e9c9a2c78a73c74f0a79221adbf2 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Fri, 20 Sep 2024 15:25:50 -0700
Subject: [PATCH 65/81] Add Ar macro

---
 src/Text/Pandoc/Readers/Mdoc.hs | 6 ++++++
 test/Tests/Readers/Mdoc.hs      | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index bb08e410fdb1..5a3166d47230 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -401,6 +401,11 @@ parseFl = do
    flags [] = ["-"]
    flags xs = map ("-" <>) xs
 
+parseAr :: PandocMonad m => MdocParser m Inlines
+parseAr = simpleInline "Ar" ar
+  where ar x | null x = B.codeWith (cls "variable") "file ..."
+             | otherwise = B.codeWith (cls "variable") $ stringify x
+
 
 parseCm :: PandocMonad m => MdocParser m Inlines
 parseCm = codeLikeInline "Cm"
@@ -530,6 +535,7 @@ parseInlineMacro =
       parsePa,
       parseFl,
       parseCm,
+      parseAr,
       parseNo,
       parseNm,
       parseXr,
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index 5d007e05dacd..e1bc78ef001b 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -70,6 +70,9 @@ tests = [
     , "Ev" =:
         ".Ev HELLO_WORLD ," =?>
         para (codeWith (cls "Ev") "HELLO_WORLD" <> ",")
+    , "Ar" =:
+        ".Ar ) z" =?>
+        para (codeWith (cls "variable") "file ..." <> ") " <> codeWith (cls "variable") "z")
     , "Mt" =:
         ".Mt a@example.org , b@example.org" =?>
         para ((link "mailto:a@example.org" "" "a@example.org") <>

From c9d1530cd6d43bb562d847574fcb766551898dbe Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Fri, 20 Sep 2024 16:09:01 -0700
Subject: [PATCH 66/81] Add more easy macros

---
 src/Text/Pandoc/Readers/Mdoc.hs | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 5a3166d47230..a3e785efaab5 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -291,6 +291,9 @@ simpleInline nm xform = do
 codeLikeInline :: PandocMonad m => T.Text -> MdocParser m Inlines
 codeLikeInline nm = simpleInline nm (eliminateEmpty (B.codeWith (cls nm) . stringify))
 
+spanLikeInline :: PandocMonad m => T.Text -> MdocParser m Inlines
+spanLikeInline nm = simpleInline nm (eliminateEmpty (B.spanWith (cls nm)))
+
 lineEnclosure :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m Inlines
 lineEnclosure nm xform = do
   macro nm
@@ -360,6 +363,12 @@ parseNo = simpleInline "No" (eliminateEmpty id)
 parseEv :: PandocMonad m => MdocParser m Inlines
 parseEv = codeLikeInline "Ev"
 
+parseAd :: PandocMonad m => MdocParser m Inlines
+parseAd = spanLikeInline "Ad"
+
+parseMs :: PandocMonad m => MdocParser m Inlines
+parseMs = spanLikeInline "Ms"
+
 -- I'm not sure why mandoc inserts a ~ when Mt is missing an argument,
 -- but it does, and it doesn't issue a warning, so that quirk is
 -- retained.
@@ -410,6 +419,9 @@ parseAr = simpleInline "Ar" ar
 parseCm :: PandocMonad m => MdocParser m Inlines
 parseCm = codeLikeInline "Cm"
 
+parseIc :: PandocMonad m => MdocParser m Inlines
+parseIc = codeLikeInline "Ic"
+
 parseQl :: PandocMonad m => MdocParser m Inlines
 parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify
 
@@ -535,6 +547,9 @@ parseInlineMacro =
       parsePa,
       parseFl,
       parseCm,
+      parseIc,
+      parseAd,
+      parseMs,
       parseAr,
       parseNo,
       parseNm,

From 9b63a48e6db99015ee115974e94a960517cb0f65 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Fri, 20 Sep 2024 16:09:30 -0700
Subject: [PATCH 67/81] Zap comment alluding to an abandoned helper

---
 src/Text/Pandoc/Readers/Mdoc.hs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index a3e785efaab5..b884e85bfd31 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -502,7 +502,6 @@ parseXr = do
         s <- lit <?> "Xr manual section"
         return (toString n, toString s)
 
--- TODO: can I rewrite this with argsInline?
 parseLk :: PandocMonad m => MdocParser m Inlines
 parseLk = do
   macro "Lk"

From 8fe57d37832088bff854ecff5db64725d6c7ed68 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Fri, 20 Sep 2024 17:09:23 -0700
Subject: [PATCH 68/81] Add In macro (non-SYNOPSIS version)

---
 src/Text/Pandoc/Readers/Mdoc.hs | 15 +++++++++++++++
 test/Tests/Readers/Mdoc.hs      |  3 +++
 2 files changed, 18 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index b884e85bfd31..10ef60f9f848 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -502,6 +502,20 @@ parseXr = do
         s <- lit <?> "Xr manual section"
         return (toString n, toString s)
 
+parseIn :: PandocMonad m => MdocParser m Inlines
+parseIn = do
+  macro "In"
+  openClose <- closingDelimiters
+  openOpen <- openingDelimiters
+  header <- toString <$> lit
+  close <- closingDelimiters
+  return $ open openClose openOpen <> B.codeWith (cls "In") ("<" <> header <> ">") <> close
+  where
+    open a b
+      | null a = b
+      | null b = a
+      | otherwise = a <> B.space <> b
+
 parseLk :: PandocMonad m => MdocParser m Inlines
 parseLk = do
   macro "Lk"
@@ -550,6 +564,7 @@ parseInlineMacro =
       parseAd,
       parseMs,
       parseAr,
+      parseIn,
       parseNo,
       parseNm,
       parseXr,
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index e1bc78ef001b..896b56ae7be3 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -73,6 +73,9 @@ tests = [
     , "Ar" =:
         ".Ar ) z" =?>
         para (codeWith (cls "variable") "file ..." <> ") " <> codeWith (cls "variable") "z")
+    , "In" =:
+        ".In ( math.h ) b c" =?>
+        para ("(" <> codeWith (cls "In") "<math.h>" <> ") b c")
     , "Mt" =:
         ".Mt a@example.org , b@example.org" =?>
         para ((link "mailto:a@example.org" "" "a@example.org") <>

From dbf31cb30a69a8c49d399e2067483cd9d546485c Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Tue, 24 Sep 2024 14:31:32 -0700
Subject: [PATCH 69/81] Ignore .Ns at start of line in mdoc lexer

Solves a delta with mandoc
---
 src/Text/Pandoc/Readers/Mdoc/Lex.hs | 7 ++++++-
 test/Tests/Readers/Mdoc.hs          | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc/Lex.hs b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
index 3cf741101a02..a54e0915154c 100644
--- a/src/Text/Pandoc/Readers/Mdoc/Lex.hs
+++ b/src/Text/Pandoc/Readers/Mdoc/Lex.hs
@@ -208,9 +208,14 @@ lexControlLine = do
   guard $ sourceColumn pos == 1
   char '.'
   m@(Macro name _) <- lexMacro
+  -- .Ns macros at the start of a line are ignored. We'd have to look behind
+  -- to keep track of the "start of the line" in the parser, so we'll drop
+  -- those macros in lexing.
+  let start | name == "Ns" = []
+            | otherwise = [m]
   let parsed = isParsedMacro name
   (wds, e) <- manyUntil (l parsed) eofline
-  return $ MdocTokens $ Seq.fromList $ (m:wds) <> [e]
+  return $ MdocTokens $ Seq.fromList $ start <> wds <> [e]
     where
       l True = try lexDelim <|> try lexCallableMacro <|> lexLit
       l False = try lexDelim <|> lexLit
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index 896b56ae7be3..c836869895f6 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -145,7 +145,7 @@ tests = [
   , testGroup "Ns macro"
     [ "at the beginning of a macro line (mandoc delta)" =:
         T.unlines [".Op before", ".Ns Op after"] =?>
-        para "[before][after]"  -- mandoc: warning + "[before] [after]"
+        para "[before] [after]"  -- mandoc: warning
     , "after a block closing macro" =:
         T.unlines [".Oo before", ".Oc Ns Op after"] =?>
         para "[before][after]"

From b8b7411d61112372d84bcef5a063a6ece03badc6 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Tue, 24 Sep 2024 14:35:49 -0700
Subject: [PATCH 70/81] Add Ap macro

the edge case of "Ap (" tested in this mandoc regress isn't present in
any actual OpenBSD base system manuals, where Ap is only ever followed
by a letter. Furthermore, "Ap" is generally uncommon compared to "Ns '"
(e.g. ".Xr mandoc 1 Ns 's"). I'm accepting a difference from mandoc here
because correctly suppressing space after the "(" here would require
more refactoring than I feel like doing at time of writing.
---
 src/Text/Pandoc/Readers/Mdoc.hs | 16 +++++++++++++++-
 test/Tests/Readers/Mdoc.hs      | 16 ++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 10ef60f9f848..7851da051a67 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -307,6 +307,9 @@ lineEnclosure nm xform = do
 noSpace :: Inlines
 noSpace = B.rawInline "mdoc" "Ns"
 
+apMacro :: Inlines
+apMacro = B.rawInline "mdoc" "Ap"
+
 data SpacifyState = SpacifyState { accum :: [Inlines], prev :: Inlines, ns :: Bool }
 instance Default SpacifyState where def = SpacifyState [] mempty False
 
@@ -316,8 +319,9 @@ foldNoSpaces xs = (finalize . foldl go def) xs
     go :: SpacifyState -> Inlines -> SpacifyState
     go s x
       | ns s && x == noSpace = s
-      | ns s                 = s{prev = prev s <> x, ns = False}
+      |         x == apMacro = s{prev = prev s <> "'", ns = True}
       |         x == noSpace = s{ns = True}
+      | ns s                 = s{prev = prev s <> x, ns = False}
       | null (prev s)        = s{prev = x}
       | otherwise            = s{accum = accum s <> [prev s], prev = x}
     finalize s
@@ -546,6 +550,15 @@ parseLk = do
 parseNs :: PandocMonad m => MdocParser m Inlines
 parseNs = macro "Ns" >> return noSpace
 
+parseAp :: PandocMonad m => MdocParser m Inlines
+parseAp = macro "Ap" >> return apMacro
+
+-- parseAp :: PandocMonad m => MdocParser m Inlines
+-- parseAp = do
+--   macro "Ap"
+--   return $ B.singleton apMacro
+
+
 -- TODO should possibly rename this function b/c some of these are
 -- Mdoc block partial-implicit macros. Unclear if this distinction
 -- is going to be relevant.
@@ -585,6 +598,7 @@ parseInlineMacro =
       parseBro,
       parseAo,
       parseOo,
+      parseAp,
       parseNs
     ]
 
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index c836869895f6..65f7ae88b4cd 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -174,6 +174,22 @@ tests = [
         ".No no \"Ns\" ns \")\" No no" =?>
         para ("nons)" <> space <> "no")
     ]
+  , testGroup "Ap macro"
+    [ "in the middle of a macro line" =:
+        ".Xr mandoc 1 Ap s" =?>
+        para (spanWith (cls "Xr") "mandoc(1)" <> "'s")
+    -- mandoc difference: the edge case of "Ap (" tested in this mandoc regress
+    -- isn't present in any actual OpenBSD base system manuals, where Ap is
+    -- only ever followed by a letter. Furthermore, "Ap" is generally uncommon
+    -- compared to "Ns '" (e.g. ".Xr mandoc 1 Ns 's"). I'm accepting a
+    -- difference from mandoc here because correctly suppressing space after
+    -- the "(" here would require more refactoring than I feel like doing at
+    -- time of writing.
+    -- per mandoc, should be: para (strong "bold" <> "'(" <> strong "bold")
+    , "with punctuation and called macro" =:
+        ".Sy bold Ap ( \"Sy\" bold" =?>
+        para (strong "bold" <> "'( " <> strong "bold")
+    ]
   , testGroup "inline punctuation"
     [ testGroup "leading punctuation"
       [ "open paren"           =: ".Em ( b"     =?> para ("(" <> emph "b")

From f7d07bebe948e19f2a8e693fe5aa0fd478c86696 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Tue, 24 Sep 2024 15:29:37 -0700
Subject: [PATCH 71/81] Add a couple more easy codelike macros

---
 src/Text/Pandoc/Readers/Mdoc.hs | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 7851da051a67..b4985090cdfb 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -426,6 +426,12 @@ parseCm = codeLikeInline "Cm"
 parseIc :: PandocMonad m => MdocParser m Inlines
 parseIc = codeLikeInline "Ic"
 
+parseEr :: PandocMonad m => MdocParser m Inlines
+parseEr = codeLikeInline "Er"
+
+parseCd :: PandocMonad m => MdocParser m Inlines
+parseCd = codeLikeInline "Cd"
+
 parseQl :: PandocMonad m => MdocParser m Inlines
 parseQl = lineEnclosure "Ql" $ B.codeWith (cls "Ql") . stringify
 
@@ -574,6 +580,8 @@ parseInlineMacro =
       parseFl,
       parseCm,
       parseIc,
+      parseEr,
+      parseCd,
       parseAd,
       parseMs,
       parseAr,

From 3464d9cba65790084b902aad89b73ce66969a3b9 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Wed, 25 Sep 2024 14:30:34 -0700
Subject: [PATCH 72/81] tktktk: Sm support

---
 src/Text/Pandoc/Readers/Mdoc.hs | 123 ++++++++++++++++++++++----------
 test/Tests/Readers/Mdoc.hs      |  17 +++++
 2 files changed, 102 insertions(+), 38 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index b4985090cdfb..181c4b850c8c 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -1,5 +1,6 @@
 {-# LANGUAGE FlexibleContexts  #-}
 {-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE LambdaCase #-}
 {- |
    Module      : Text.Pandoc.Readers.Mdoc
    Copyright   : 
@@ -43,21 +44,28 @@ data MdocSection
   | ShOther
   deriving (Show, Eq)
 
-data ManState = ManState { readerOptions   :: ReaderOptions
-                         , metadata        :: Meta
-                         , tableCellsPlain :: Bool
-                         , progName :: Maybe T.Text
-                         , currentSection :: MdocSection
-                         } deriving Show
-
-instance Default ManState where
-  def = ManState { readerOptions   = def
-                 , metadata        = B.nullMeta
-                 , tableCellsPlain = True
-                 , currentSection = ShOther
-                 , progName = Nothing }
-
-type MdocParser m = P.ParsecT [MdocToken] ManState m
+data MdocState = MdocState
+    { readerOptions :: ReaderOptions
+    , metadata :: Meta
+    , tableCellsPlain :: Bool
+    , spacingMode :: Bool
+    , progName :: Maybe T.Text
+    , currentSection :: MdocSection
+    }
+    deriving (Show)
+
+instance Default MdocState where
+    def =
+        MdocState
+            { readerOptions = def
+            , metadata = B.nullMeta
+            , tableCellsPlain = True
+            , spacingMode = True
+            , currentSection = ShOther
+            , progName = Nothing
+            }
+
+type MdocParser m = P.ParsecT [MdocToken] MdocState m
 
 
 -- | Read mdoc from an input string and return a Pandoc document.
@@ -68,15 +76,15 @@ readMdoc :: (PandocMonad m, ToSources a)
 readMdoc opts s = do
   let Sources inps = toSources s
   tokenz <- mconcat <$> mapM (uncurry lexMdoc) inps
-  let state = def {readerOptions = opts} :: ManState
+  let state = def {readerOptions = opts} :: MdocState
   eitherdoc <- readWithMTokens parseMdoc state
      (Foldable.toList . unRoffTokens $ tokenz)
   either (throwError . fromParsecError (Sources inps)) return eitherdoc
 
 
 readWithMTokens :: PandocMonad m
-        => ParsecT [MdocToken] ManState m a  -- ^ parser
-        -> ManState                         -- ^ initial state
+        => ParsecT [MdocToken] MdocState m a  -- ^ parser
+        -> MdocState                         -- ^ initial state
         -> [MdocToken]                       -- ^ input
         -> m (Either ParseError a)
 readWithMTokens parser state input =
@@ -172,7 +180,7 @@ argsToInlines :: PandocMonad m => MdocParser m Inlines
 argsToInlines = do
   ls <- manyTill arg eol
   let strs = map (B.str . toString) ls
-  return $ spacify strs
+  spacify strs
 
 parsePrologue :: PandocMonad m => MdocParser m ()
 parsePrologue = do
@@ -228,6 +236,15 @@ parseStr = do
   (Str txt _) <- str
   return $ B.str txt
 
+-- Multiple consecutive strs within a block always need to get spaces between
+-- them and then packed up together, because text lines are never affected by
+-- the spacing mode. XXX but apparently this isn't actually true for mandoc,
+-- so I'm not sure what's correct here yet.
+parseStrs :: PandocMonad m => MdocParser m Inlines
+parseStrs = do
+  txt <- many1 parseStr
+  return $ mconcat $ intersperse B.space txt
+
 parseDelim :: PandocMonad m => DelimSide -> MdocParser m Inlines
 parseDelim pos = do
   (Delim _ txt _) <- delim pos
@@ -242,14 +259,14 @@ litsToInlines :: PandocMonad m => MdocParser m Inlines
 litsToInlines = do
   ls <- many1 lit
   let strs = map (B.str . toString) ls
-  return $ spacify strs
+  spacify strs
 
 litsAndDelimsToInlines :: PandocMonad m => MdocParser m Inlines
 litsAndDelimsToInlines = do
   (o, ls, c) <- delimitedArgs $ many lit
   guard $ not (null o && null ls && null c)
-  let strs = map (B.str . toString) ls
-  return $ o <> spacify strs <> c
+  strs <- spacify $ map (B.str . toString) ls
+  return $ o <> strs <> c
 
 openingDelimiters :: PandocMonad m => MdocParser m Inlines
 openingDelimiters = do
@@ -260,7 +277,7 @@ openingDelimiters = do
     return $ openDelim <> omid
 
 pipes :: PandocMonad m => MdocParser m Inlines
-pipes = spacify <$> many (parseDelim Middle)
+pipes = many (parseDelim Middle) >>= spacify
 
 closingDelimiters :: PandocMonad m => MdocParser m Inlines
 closingDelimiters = do
@@ -282,7 +299,7 @@ simpleInline :: PandocMonad m => T.Text -> (Inlines -> Inlines) -> MdocParser m
 simpleInline nm xform = do
   macro nm
   segs <- manyTill segment inlineContextEnd
-  return $ spacify segs
+  spacify segs
  where
    segment = do
       (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToInlines
@@ -302,7 +319,8 @@ lineEnclosure nm xform = do
     (manyTill
       (parseInlineMacro <|> (try (litsAndDelimsToInlines <* notFollowedBy eol))
         <|> litsToInlines) (lookAhead (many (macro "Ns" <|> delim Close) *> eol)))
-  return $ first <> xform (spacify further) <> finally
+  further' <- spacify further
+  return $ first <> xform further' <> finally
 
 noSpace :: Inlines
 noSpace = B.rawInline "mdoc" "Ns"
@@ -310,8 +328,14 @@ noSpace = B.rawInline "mdoc" "Ns"
 apMacro :: Inlines
 apMacro = B.rawInline "mdoc" "Ap"
 
-data SpacifyState = SpacifyState { accum :: [Inlines], prev :: Inlines, ns :: Bool }
-instance Default SpacifyState where def = SpacifyState [] mempty False
+smOff :: Inlines
+smOff = B.rawInline "mdoc" "Sm off"
+
+smOn :: Inlines
+smOn = B.rawInline "mdoc" "Sm on"
+
+data SpacifyState = SpacifyState { accum :: [Inlines], prev :: Inlines, ns :: Bool, sm :: Bool}
+instance Default SpacifyState where def = SpacifyState [] mempty False True
 
 foldNoSpaces :: [Inlines] -> [Inlines]
 foldNoSpaces xs = (finalize . foldl go def) xs
@@ -321,15 +345,23 @@ foldNoSpaces xs = (finalize . foldl go def) xs
       | ns s && x == noSpace = s
       |         x == apMacro = s{prev = prev s <> "'", ns = True}
       |         x == noSpace = s{ns = True}
+      |         x == smOn    = s{sm = True}
+      | sm s && x == smOff   = s{accum = accum s <> [prev s], prev = mempty, sm = False}
       | ns s                 = s{prev = prev s <> x, ns = False}
+      | not (sm s)           = s{prev = prev s <> x}
       | null (prev s)        = s{prev = x}
       | otherwise            = s{accum = accum s <> [prev s], prev = x}
     finalize s
       | null (prev s) = accum s
       | otherwise     = accum s <> [prev s]
 
-spacify :: [Inlines] -> Inlines
-spacify = mconcat . intersperse B.space . foldNoSpaces
+spacify :: PandocMonad m => [Inlines] -> MdocParser m Inlines
+spacify x = do
+  mode <- spacingMode <$> getState
+  return (go mode x)
+  where
+    go True = mconcat . intersperse B.space . foldNoSpaces
+    go False = mconcat . foldNoSpaces
 
 {- Compatibility note: mandoc permits, and doesn't warn on, "vertical" macros
  (Pp, Bl/El, Bd/Ed) inside of "horizontal" block partial-explicit quotations
@@ -345,7 +377,8 @@ multilineEnclosure op cl xform = do
   (macro cl <?> show cl)
   closeDelim <- mconcat <$> many (parseDelim Close)
   optional eol
-  return $ openDelim <> xform (spacify contents) <> closeDelim
+  contents' <- spacify contents
+  return $ openDelim <> xform contents' <> closeDelim
 
 eliminateEmpty :: (Inlines -> Inlines) -> Inlines -> Inlines
 eliminateEmpty x y = if null y then mempty else x y
@@ -391,7 +424,7 @@ parseFl = do
   macro "Fl"
   start <- option mempty (emptyWithDelim <|> flfl <|> emptyWithMacro <|> emptyEmpty)
   segs <- manyTill segment inlineContextEnd
-  return $ spacify ([start] <> segs)
+  spacify ([start] <> segs)
  where
    emptyWithDelim = do
      lookAhead $ many1 (delim Middle <|> delim Close)
@@ -409,7 +442,8 @@ parseFl = do
    emptyEmpty = lookAhead eol $> fl "-"
    segment = do
       (openDelim, inlines, closeDelim) <- delimitedArgs $ option mempty litsToText
-      return $ openDelim <> (spacify . (map fl) . flags) inlines <> closeDelim
+      inner <- (spacify . (map fl) . flags) inlines
+      return $ openDelim <> inner <> closeDelim
    fl = B.codeWith (cls "Fl")
    flags [] = ["-"]
    flags xs = map ("-" <>) xs
@@ -532,7 +566,7 @@ parseLk = do
   openClose <- closingDelimiters
   openOpen <- openingDelimiters
   url <- toString <$> lit
-  inner <- spacify <$> many segment
+  inner <- many segment >>= spacify
   close <- closingDelimiters
   let label | null inner = B.str url
             | otherwise = inner
@@ -611,13 +645,12 @@ parseInlineMacro =
     ]
 
 parseInline :: PandocMonad m => MdocParser m Inlines
-parseInline = parseStr  <|>
-  ((parseInlineMacro <|> litsAndDelimsToInlines) <* optional eol)
-
+parseInline = parseStrs <|> (controlLine >>= spacify)
+  where
+    controlLine = many1 ((parseInlineMacro <|> litsAndDelimsToInlines) <* optional eol)
 
--- TODO probably need some kind of fold to deal with Ns
 parseInlines :: PandocMonad m => MdocParser m Inlines
-parseInlines = spacify <$> many1 parseInline
+parseInlines = many1 (parseSmToggle <|> parseInline) >>= spacify
 
 parsePara :: PandocMonad m => MdocParser m Blocks
 parsePara = do
@@ -643,6 +676,20 @@ parseCodeBlock = do
 skipBlanks :: PandocMonad m => MdocParser m Blocks
 skipBlanks = many1 blank *> mempty
 
+parseSmToggle :: PandocMonad m => MdocParser m Inlines
+parseSmToggle = do
+  macro "Sm"
+  cur <- spacingMode <$> getState
+  mode <- optionMaybe (literal "on" $> True <|> literal "off" $> False)
+  eol
+  let newMode = update mode cur
+  modifyState $ \s -> s{spacingMode = newMode}
+  return $ if newMode then smOn else smOff
+  where
+    update = \case
+      Nothing -> not
+      Just x -> const x
+
 parseBlock :: PandocMonad m => MdocParser m Blocks
 parseBlock = choice [ -- parseList
                     -- , parseDefinitionList
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index 65f7ae88b4cd..b3de25a09ee8 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -174,6 +174,23 @@ tests = [
         ".No no \"Ns\" ns \")\" No no" =?>
         para ("nons)" <> space <> "no")
     ]
+  , testGroup "spacing mode"
+    [ "all text" =:
+        T.unlines ["a", ".Sm off", "b c", "d", ".Sm on", "e"] =?>
+        para ("a " <> str "b c" <>  " d e")
+    , "text around macro" =:
+        T.unlines ["a", ".Sm off", ".Sy b c", ".Sm on", "d"] =?>
+        para ("a" <> space <> strong "bc" <> space <> "d")
+    , "mulitple macros" =:
+        T.unlines ["a", ".Sm off", ".Sy b Em c", ".Sm on", "d"] =?>
+        para ("a" <> space <> strong "b" <> emph "c" <> space <> "d")
+    , "mulitple control lines" =:
+        T.unlines ["a", ".Sm off", ".Sy b", ".Em c", ".Sm on", "d"] =?>
+        para ("a" <> space <> strong "b" <> emph "c" <> space <> "d")
+    , "mixed control and text lines" =:
+        T.unlines ["a", ".Sm off", ".Sy b", "c", ".Em d", ".Sm on", "d"] =?>
+        para ("a" <> space <> strong "b" <> "c" <> emph "d" <> space <> "d")
+    ]
   , testGroup "Ap macro"
     [ "in the middle of a macro line" =:
         ".Xr mandoc 1 Ap s" =?>

From 3b74c82e52abb1cac525716f60d71d9c230c68f8 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 26 Sep 2024 14:36:56 -0700
Subject: [PATCH 73/81] Abandon whitespace-preservation in text lines

It ends up with bad results in the ANSI writer, for example, because it
then can't break lines at Spaces. This isn't wholly inconsistent with
mandoc, because it makes no effort to render multiple consecutive spaces
from the source document in HTML.
---
 src/Text/Pandoc/Readers/Mdoc.hs | 5 +----
 test/Tests/Readers/Mdoc.hs      | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 181c4b850c8c..b60f510434f2 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -228,13 +228,10 @@ parseSynopsisSection = do
   guard $ sec == ShSynopsis
   return mempty
 
--- parseStr doesn't use B.text because roff(7) specifies that
--- whitespace in text lines is treated literally.
--- XXX is this what we actually want?
 parseStr :: PandocMonad m => MdocParser m Inlines
 parseStr = do
   (Str txt _) <- str
-  return $ B.str txt
+  return $ B.text txt
 
 -- Multiple consecutive strs within a block always need to get spaces between
 -- them and then packed up together, because text lines are never affected by
diff --git a/test/Tests/Readers/Mdoc.hs b/test/Tests/Readers/Mdoc.hs
index b3de25a09ee8..77fc529c90f0 100644
--- a/test/Tests/Readers/Mdoc.hs
+++ b/test/Tests/Readers/Mdoc.hs
@@ -177,7 +177,7 @@ tests = [
   , testGroup "spacing mode"
     [ "all text" =:
         T.unlines ["a", ".Sm off", "b c", "d", ".Sm on", "e"] =?>
-        para ("a " <> str "b c" <>  " d e")
+        para ("a b c d e")
     , "text around macro" =:
         T.unlines ["a", ".Sm off", ".Sy b c", ".Sm on", "d"] =?>
         para ("a" <> space <> strong "bc" <> space <> "d")

From 681e1d3f5a4538fff21756e120e4bbb32cba7638 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 26 Sep 2024 15:26:40 -0700
Subject: [PATCH 74/81] Skip to the end of the SYNOPSIS for now

---
 src/Text/Pandoc/Readers/Mdoc.hs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index b60f510434f2..24774eec9f60 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -226,6 +226,8 @@ parseSynopsisSection :: PandocMonad m => MdocParser m Blocks
 parseSynopsisSection = do
   sec <- currentSection <$> getState
   guard $ sec == ShSynopsis
+  -- TODO actually implement this
+  manyTill (anyToken) (lookAhead (macro "Sh"))
   return mempty
 
 parseStr :: PandocMonad m => MdocParser m Inlines

From 9d956b8c3122debfb3d641fa66af1ba3f53ccded Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 26 Sep 2024 15:27:45 -0700
Subject: [PATCH 75/81] Reformat

---
 src/Text/Pandoc/Readers/Mdoc.hs | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 24774eec9f60..34a1d377982e 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -333,8 +333,15 @@ smOff = B.rawInline "mdoc" "Sm off"
 smOn :: Inlines
 smOn = B.rawInline "mdoc" "Sm on"
 
-data SpacifyState = SpacifyState { accum :: [Inlines], prev :: Inlines, ns :: Bool, sm :: Bool}
-instance Default SpacifyState where def = SpacifyState [] mempty False True
+data SpacifyState = SpacifyState
+  { accum :: [Inlines],
+    prev :: Inlines,
+    ns :: Bool,
+    sm :: Bool
+  }
+
+instance Default SpacifyState where
+  def = SpacifyState [] mempty False True
 
 foldNoSpaces :: [Inlines] -> [Inlines]
 foldNoSpaces xs = (finalize . foldl go def) xs

From 7e0f712b3dfe7f26ae991d3406d45660e49ceb6d Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 26 Sep 2024 15:28:19 -0700
Subject: [PATCH 76/81] Fix .Nm parsing and insertion of manual name

---
 src/Text/Pandoc/Readers/Mdoc.hs | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 34a1d377982e..dc166247329b 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -528,17 +528,17 @@ parseAo = multilineEnclosure "Ao" "Ac" $ \x -> "⟨" <> x <> "⟩"
 
 parseNm :: PandocMonad m => MdocParser m Inlines
 parseNm = do
+  macro "Nm"
   mnm <- (progName <$> getState)
-  case mnm of
-    Nothing -> do
-      (_, nm, _) <- lookAhead $ delimitedArgs $ option mempty litsToInlines
-      guard $ not (null nm)
-      simpleInline "Nm" ok
-    Just nm -> simpleInline "Nm" $ \x ->
-      if null x
-         then B.codeWith (cls "Nm") nm
-         else ok x
-  where ok = B.codeWith (cls "Nm") . stringify
+  (op, rg, cl) <- delimitedArgs $ option mempty litsToInlines
+  return $ case (mnm, rg) of
+    (Just nm, x) | null x ->
+      op <> ok nm <> cl
+    (_, x) ->
+      op <> (ok . stringify) x <> cl
+  where
+    ok = B.codeWith (cls "Nm")
+
 
 parseXr :: PandocMonad m => MdocParser m Inlines
 parseXr = do

From 638d84a7453d91bdd791134ea76486827b72a067 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 26 Sep 2024 15:32:28 -0700
Subject: [PATCH 77/81] Skip unknown macros in block context

Getting to the point where I can start working with real manual pages so
this is helpful.
---
 src/Text/Pandoc/Readers/Mdoc.hs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index dc166247329b..63aad7df69f7 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -696,6 +696,9 @@ parseSmToggle = do
       Nothing -> not
       Just x -> const x
 
+skipUnknownMacro :: PandocMonad m => MdocParser m Blocks
+skipUnknownMacro = anyMacro *> manyTill anyToken eol $> mempty
+
 parseBlock :: PandocMonad m => MdocParser m Blocks
 parseBlock = choice [ -- parseList
                     -- , parseDefinitionList
@@ -708,6 +711,6 @@ parseBlock = choice [ -- parseList
                     , skipBlanks
                     -- , parseBlockQuote
                     -- , parseNewParagraph
-                    -- , skipUnknownMacro
+                    , skipUnknownMacro
                     ]
 

From 6ed129f91693a9018e9b115387ffb871b817b327 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 26 Sep 2024 16:47:32 -0700
Subject: [PATCH 78/81] Parse display blocks and 1-line literals

A bit janky but worse things have happened.
---
 src/Text/Pandoc/Readers/Mdoc.hs | 69 ++++++++++++++++++++++++++-------
 1 file changed, 55 insertions(+), 14 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 63aad7df69f7..6fe6224f2980 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -526,6 +526,12 @@ parseAq = lineEnclosure "Aq" $ \x -> "⟨" <> x <> "⟩"
 parseAo :: PandocMonad m => MdocParser m Inlines
 parseAo = multilineEnclosure "Ao" "Ac" $ \x -> "⟨" <> x <> "⟩"
 
+parseDl :: PandocMonad m => MdocParser m Blocks
+parseDl = do
+  inner <- lineEnclosure "Dl" id
+  eol
+  return $ B.codeBlock (stringify inner)
+
 parseNm :: PandocMonad m => MdocParser m Inlines
 parseNm = do
   macro "Nm"
@@ -648,20 +654,45 @@ parseInlineMacro =
       parseOo,
       parseAp,
       parseNs
-    ]
+    ] <?> "inline macro"
 
 parseInline :: PandocMonad m => MdocParser m Inlines
-parseInline = parseStrs <|> (controlLine >>= spacify)
+parseInline = parseStrs <|> (controlLine >>= spacify) <?> "text lines or inline macros"
   where
     controlLine = many1 ((parseInlineMacro <|> litsAndDelimsToInlines) <* optional eol)
 
 parseInlines :: PandocMonad m => MdocParser m Inlines
 parseInlines = many1 (parseSmToggle <|> parseInline) >>= spacify
 
+-- Lp is a deprecated synonym for Pp
 parsePara :: PandocMonad m => MdocParser m Blocks
-parsePara = do
-  optional (emptyMacro "Pp" <|> emptyMacro "Lp")  -- Lp: deprecated synonym for Pp
-  B.para . B.trimInlines <$> parseInlines
+parsePara = B.para . B.trimInlines <$> parseInlines <*
+    optional (emptyMacro "Pp" <|> emptyMacro "Lp")
+
+-- Indented display blocks are visually similar to block quotes
+-- but rarely carry those semantics. I'm just putting things in
+-- divs. Centered is discouraged and rarely seen.
+parseDisplay :: PandocMonad m => MdocParser m Blocks
+parseDisplay = do
+  literal "-filled" <|> literal "-ragged" <|> literal "-centered"
+  optional (literal "-offset" *> lit)
+  optional (literal "-compact")
+  eol
+  B.divWith (cls "display") . mconcat <$> many parseRegularBlock
+
+-- This is something of a best-effort interpretation of the -unfilled
+-- display block type. The main difference with mandoc is probably
+-- that newlines inside of multiline enclosures won't be preserved.
+parseUnfilled :: PandocMonad m => MdocParser m Blocks
+parseUnfilled = do
+  literal "-unfilled"
+  optional (literal "-offset" *> lit)
+  optional (literal "-compact")
+  eol
+  lns <- many (parseStr <|> (blank $> mempty) <|> (controlLine >>= spacify))
+  return $ B.lineBlock lns
+  where
+    controlLine = many1 ((parseInlineMacro <|> litsAndDelimsToInlines)) <* optional eol
 
 -- CodeBlocks can't contain any other markup, but mdoc
 -- still interprets control lines within .Bd -literal
@@ -669,15 +700,20 @@ parsePara = do
 -- we get any control lines inside a Bd literal
 parseCodeBlock :: PandocMonad m => MdocParser m Blocks
 parseCodeBlock = do
-  macro "Bd" -- TODO will need to hoist
   literal "-literal"
   optional (literal "-offset" *> lit)
   optional (literal "-compact")
   eol
   l <- T.unlines . map toString <$> many (str <|> blank)
-  emptyMacro "Ed"
   return $ B.codeBlock l
 
+parseBd :: PandocMonad m => MdocParser m Blocks
+parseBd = do
+  macro "Bd"
+  blk <- parseCodeBlock <|> parseDisplay <|> parseUnfilled
+  emptyMacro "Ed"
+  return blk
+
 
 skipBlanks :: PandocMonad m => MdocParser m Blocks
 skipBlanks = many1 blank *> mempty
@@ -699,18 +735,23 @@ parseSmToggle = do
 skipUnknownMacro :: PandocMonad m => MdocParser m Blocks
 skipUnknownMacro = anyMacro *> manyTill anyToken eol $> mempty
 
+parseRegularBlock :: PandocMonad m => MdocParser m Blocks
+parseRegularBlock =
+  choice
+    [ parseDl
+    , parsePara
+    , emptyMacro "Pp" *> mempty
+    , parseBd
+    , skipBlanks
+    , skipUnknownMacro
+    ]
+
 parseBlock :: PandocMonad m => MdocParser m Blocks
 parseBlock = choice [ -- parseList
                     -- , parseDefinitionList
                     parseHeader
                     , parseNameSection
                     , parseSynopsisSection
-                    , parsePara
-                    -- , parseTable
-                    , parseCodeBlock
-                    , skipBlanks
-                    -- , parseBlockQuote
-                    -- , parseNewParagraph
-                    , skipUnknownMacro
+                    , parseRegularBlock
                     ]
 

From 2d94ad6b2dfbbbfab760d298cb471ccf642d04a3 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 26 Sep 2024 21:25:09 -0700
Subject: [PATCH 79/81] Move skipUnknownMacro to outer blocks

Was having probalos
---
 src/Text/Pandoc/Readers/Mdoc.hs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 6fe6224f2980..7efa4f0f440a 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -743,7 +743,6 @@ parseRegularBlock =
     , emptyMacro "Pp" *> mempty
     , parseBd
     , skipBlanks
-    , skipUnknownMacro
     ]
 
 parseBlock :: PandocMonad m => MdocParser m Blocks
@@ -753,5 +752,6 @@ parseBlock = choice [ -- parseList
                     , parseNameSection
                     , parseSynopsisSection
                     , parseRegularBlock
+                    , skipUnknownMacro
                     ]
 

From 2589dbbb323b8c5e6c743ca9ca1e14db54c25c04 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 26 Sep 2024 21:25:47 -0700
Subject: [PATCH 80/81] Add Dv macro

---
 src/Text/Pandoc/Readers/Mdoc.hs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index 7efa4f0f440a..c10e5e3f51c0 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -406,6 +406,9 @@ parseNo = simpleInline "No" (eliminateEmpty id)
 parseEv :: PandocMonad m => MdocParser m Inlines
 parseEv = codeLikeInline "Ev"
 
+parseDv :: PandocMonad m => MdocParser m Inlines
+parseDv = codeLikeInline "Dv"
+
 parseAd :: PandocMonad m => MdocParser m Inlines
 parseAd = spanLikeInline "Ad"
 
@@ -621,6 +624,7 @@ parseInlineMacro =
       parseEm,
       parseLk,
       parseEv,
+      parseDv,
       parseMt,
       parsePa,
       parseFl,

From 2147b2f6528b5df130d9a27d15cbe9334b4d3a49 Mon Sep 17 00:00:00 2001
From: Evan Silberman <evan@jklol.net>
Date: Thu, 26 Sep 2024 21:27:20 -0700
Subject: [PATCH 81/81] Add bullet, ordered, and definition lists

---
 src/Text/Pandoc/Readers/Mdoc.hs | 40 +++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/src/Text/Pandoc/Readers/Mdoc.hs b/src/Text/Pandoc/Readers/Mdoc.hs
index c10e5e3f51c0..6643ea588116 100644
--- a/src/Text/Pandoc/Readers/Mdoc.hs
+++ b/src/Text/Pandoc/Readers/Mdoc.hs
@@ -718,6 +718,45 @@ parseBd = do
   emptyMacro "Ed"
   return blk
 
+parseItemList :: PandocMonad m => MdocParser m Blocks
+parseItemList = do
+  f <- (choice (map literal ["-bullet", "-dash", "-hyphen", "-item"]) $> B.bulletList)
+       <|> literal "-enum" $> B.orderedList
+  optional (literal "-width" *> lit)
+  optional (literal "-offset" *> lit)
+  optional (literal "-compact")
+  eol
+  items <- many bulletItem
+  return $ f items
+  where
+    bulletItem = do
+      emptyMacro "It"
+      mconcat <$> many parseRegularBlock
+
+parseDefinitionList :: PandocMonad m => MdocParser m Blocks
+parseDefinitionList = do
+  choice $ map literal ["-hang", "-inset", "-ohang", "-tag"]
+  optional (literal "-width" *> lit)
+  optional (literal "-offset" *> lit)
+  optional (literal "-compact")
+  eol
+  items <- many dlItem
+  return $ B.definitionList items
+  where
+    dlItem = do
+      macro "It"
+      dt <- listHead >>= spacify
+      dd <- mconcat <$> many parseRegularBlock
+      return (dt, [dd])
+    -- TODO support Xo/Xc
+    listHead = many1 ((parseInlineMacro <|> litsAndDelimsToInlines)) <* optional eol
+
+parseBl :: PandocMonad m => MdocParser m Blocks
+parseBl = do
+  macro "Bl"
+  blk <- parseItemList <|> parseDefinitionList
+  emptyMacro "El"
+  return blk
 
 skipBlanks :: PandocMonad m => MdocParser m Blocks
 skipBlanks = many1 blank *> mempty
@@ -746,6 +785,7 @@ parseRegularBlock =
     , parsePara
     , emptyMacro "Pp" *> mempty
     , parseBd
+    , parseBl
     , skipBlanks
     ]