author | Daniel Schadt <kingdread@gmx.de> | 2023-12-02 11:52:18 +0100
committer | Daniel Schadt <kingdread@gmx.de> | 2023-12-02 11:52:18 +0100
commit | 430c17097cba8c970c2826be0e622b6cb7bb6818 (patch)
tree | 33f63a17c359816b176aa364c130c32257e8cc4b /src
Diffstat (limited to 'src')
-rw-r--r-- | src/MiniScalp.hs | 56
-rw-r--r-- | src/MiniScalp/Predicates.hs | 72
-rw-r--r-- | src/MiniScalp/Query.hs | 91
-rw-r--r-- | src/MiniScalp/Sources.hs | 32
-rw-r--r-- | src/MiniScalp/Types.hs | 61
5 files changed, 312 insertions, 0 deletions
diff --git a/src/MiniScalp.hs b/src/MiniScalp.hs
new file mode 100644
index 0000000..f0e3e11
--- /dev/null
+++ b/src/MiniScalp.hs
@@ -0,0 +1,56 @@
+-- |
+-- Module      : MiniScalp
+-- Description : Wrapper around zenacy-html for web scraping
+-- Copyright   : (c) Daniel Schadt, 2023
+-- License     : MIT
+-- Maintainer  : sample@email.com
+-- Stability   : experimental
+-- Portability : POSIX
+--
+-- MiniScalp is a wrapper around [@zenacy-html@](https://hackage.haskell.org/package/zenacy-html) to do web scraping.
+--
+-- As @zenacy-html@ implements HTML parsing according to the WHATWG standard, this should produce results equivalent to
+-- what your browser produces.
+--
+-- = DOM navigation
+--
+-- The interface of MiniScalp is similar to that of [@scalpel@](https://hackage.haskell.org/package/scalpel) in the
+-- sense that you are provided with a monadic interface to the DOM. The bread and butter function is
+-- 'MiniScalp.Query.chroot', which allows you to focus on a specific subpart of the DOM by using
+-- 'MiniScalp.Types.Predicate' functions.
+--
+-- A number of helpful predicates are defined in "MiniScalp.Predicates".
+--
+-- = Example
+--
+-- > import Control.Monad
+-- > import Data.Maybe
+-- > import Data.Text
+-- > import MiniScalp.Predicates
+-- > import MiniScalp.Query
+-- > import MiniScalp.Sources
+-- > import MiniScalp.Types
+-- >
+-- > data MensaLine = MensaLine Text [Text] deriving (Show)
+-- >
+-- > mensaScraper :: Scraper [MensaLine]
+-- > mensaScraper = chroots (tag "tr" @& hasClass "mensatype_rows") $ do
+-- >   name <- chroot ("td" @: [hasClass "mensatype"]) text'
+-- >   meals <- chroots ("td" @: [hasClass "menu-title"]) text'
+-- >   return $ MensaLine name meals
+-- >
+-- > main :: IO ()
+-- > main = do
+-- >   scraped <- fromJust <$> scrapeFile "mensa.html" mensaScraper
+-- >   forM_ scraped $ \(MensaLine name meals) -> do
+-- >     putStrLn $ unpack name
+-- >     forM_ meals $ \meal -> putStrLn ("  " ++ unpack meal)
+-- >     putStrLn ""
+--
+-- = Modules
+--
+-- * "MiniScalp.Types": Basic definitions of the needed types.
+-- * "MiniScalp.Query": Data extraction routines.
+-- * "MiniScalp.Predicates": Predicates to select the wanted nodes.
+-- * "MiniScalp.Sources": Various data sources.
+module MiniScalp () where
diff --git a/src/MiniScalp/Predicates.hs b/src/MiniScalp/Predicates.hs
new file mode 100644
index 0000000..bcf0a11
--- /dev/null
+++ b/src/MiniScalp/Predicates.hs
@@ -0,0 +1,72 @@
+-- | Various predicates to match HTML nodes.
+module MiniScalp.Predicates
+  ( tag,
+    (@&),
+    (@|),
+    (@=),
+    (@/),
+    (@:),
+    hasClass,
+  )
+where
+
+import Data.List (tails)
+import Data.Text (Text)
+import MiniScalp.Types
+import Zenacy.HTML
+
+-- | Matches if both predicates match.
+--
+-- > tag "tr" @& hasClass "menu-title"
+(@&) :: Predicate -> Predicate -> Predicate
+a @& b = \s n -> a s n && b s n
+
+infixl 8 @&
+
+-- | Matches if one of the predicates matches.
+--
+-- > tag "thead" @| tag "tbody"
+(@|) :: Predicate -> Predicate -> Predicate
+a @| b = \s n -> a s n || b s n
+
+infixl 7 @|
+
+-- | Matches if the current node has the given tag.
+--
+-- > tag "p"
+tag :: Text -> Predicate
+tag t _ = htmlElemHasName t
+
+-- | Matches if the current node has the given attribute and value.
+--
+-- > "id" @= "description"
+(@=) :: Text -> Text -> Predicate
+k @= v = \_ n -> htmlElemHasAttrVal k v n
+
+infix 9 @=
+
+-- | Matches if the node has the given class.
+--
+-- > hasClass "src"
+hasClass :: Text -> Predicate
+hasClass c _ = htmlElemClassesContains c
+
+-- | Matches if the right predicate matches the current node, and the left predicate matches an ancestor node.
+--
+-- > tag "p" @/ tag "img"
+(@/) :: Predicate -> Predicate -> Predicate
+a @/ b = \s n -> b s n && or (zipWith a (drop 1 $ tails s) s)
+
+infixl 1 @/
+
+-- | Shorthand to find a specific tag with the given predicates.
+--
+-- The following two are equivalent:
+--
+-- > "p" @: [hasClass "text", "id" @= "description"]
+--
+-- and
+--
+-- > tag "p" @& hasClass "text" @& "id" @= "description"
+(@:) :: Text -> [Predicate] -> Predicate
+t @: a = tag t @& \s n -> all (\p -> p s n) a
diff --git a/src/MiniScalp/Query.hs b/src/MiniScalp/Query.hs
new file mode 100644
index 0000000..07784fb
--- /dev/null
+++ b/src/MiniScalp/Query.hs
@@ -0,0 +1,91 @@
+-- | Data retrieval functions.
+module MiniScalp.Query
+  ( -- * Simple accessors
+    node,
+    parents,
+    text,
+    text',
+    attribute,
+    html,
+
+    -- * Complex navigation
+    retrieve,
+    chroots,
+    chroot,
+  )
+where
+
+import Control.Applicative (empty, optional)
+import Control.Monad (forM)
+import Control.Monad.Reader (asks, local)
+import Data.Maybe (catMaybes)
+import Data.Text (Text)
+import Data.Text qualified as T
+import MiniScalp.Types
+import Zenacy.HTML
+
+-- | Retrieves the current node.
+node :: (Monad m) => ScraperT m HTMLNode
+node = asks snd
+
+-- | Retrieves the parents of the current node.
+--
+-- Note that the first entry is the immediate parent.
+parents :: (Monad m) => ScraperT m [HTMLNode]
+parents = asks fst
+
+-- | Retrieves the text of the current node.
+text :: (Monad m) => ScraperT m Text
+text = node >>= \n -> maybe empty return $ htmlElemText n
+
+-- | Recursively retrieves the text of the current node and all child nodes.
+text' :: (Monad m) => ScraperT m Text
+text' = recurseText <$> node
+  where
+    recurseText :: HTMLNode -> Text
+    recurseText (HTMLDocument _ c) = T.concat $ map recurseText c
+    recurseText (HTMLDoctype {}) = mempty
+    recurseText (HTMLFragment _ c) = T.concat $ map recurseText c
+    recurseText (HTMLElement _ _ _ c) = T.concat $ map recurseText c
+    recurseText (HTMLTemplate {}) = mempty
+    recurseText (HTMLText t) = t
+    recurseText (HTMLComment _) = mempty
+
+-- | Retrieves the value of the attribute with the given name.
+--
+-- Fails if the attribute does not exist.
+attribute :: (Monad m) => Text -> ScraperT m Text
+attribute a = node >>= \n -> maybe empty return $ htmlElemGetAttr a n
+
+-- | Retrieves the rendered HTML of the current node.
+--
+-- Note that this may not correspond to the original source, as it is re-rendered from the DOM.
+html :: (Monad m) => ScraperT m Text
+html = htmlRender <$> node
+
+-- | Retrieves all contexts in the current subtree (including the current node) whose node matches the given predicate.
+retrieve :: (Monad m) => Predicate -> ScraperT m [ScrapeContext]
+retrieve predicate = do
+  n <- node
+  ps <- parents
+  let includeRoot = predicate ps n
+  children <- concat <$> forM (htmlNodeContent n) (\child -> local (const (n : ps, child)) (retrieve predicate))
+  return $ if includeRoot then (ps, n) : children else children
+
+-- | Finds the elements according to the predicate and then executes the given scraper in their contexts.
+--
+-- If a subscraper fails, it is silently skipped.
+chroots :: (Monad m) => Predicate -> ScraperT m a -> ScraperT m [a]
+chroots p s = do
+  es <- retrieve p
+  catMaybes <$> forM es (\ctx -> local (const ctx) $ optional s)
+
+-- | Like 'chroots', but returns only the result from the first context in which the scraper succeeds.
+--
+-- If there is no such context, this scraper fails.
+chroot :: (Monad m) => Predicate -> ScraperT m a -> ScraperT m a
+chroot p s = do
+  cs <- chroots p s
+  case cs of
+    a : _ -> return a
+    [] -> empty
diff --git a/src/MiniScalp/Sources.hs b/src/MiniScalp/Sources.hs
new file mode 100644
index 0000000..959b41a
--- /dev/null
+++ b/src/MiniScalp/Sources.hs
@@ -0,0 +1,32 @@
+-- | Various entry points for scrapers using different data sources.
+module MiniScalp.Sources
+  ( -- * Scraping in-memory text
+    scrapeTextT,
+    scrapeText,
+    -- * Scraping local files
+    scrapeFileT,
+    scrapeFile,
+  )
+where
+
+import Data.Functor ((<&>))
+import Data.Text (Text)
+import Data.Text.IO qualified as T
+import MiniScalp.Types
+import Zenacy.HTML (htmlParseEasy)
+
+-- | Parse and scrape the given 'Text'.
+scrapeTextT :: Text -> ScraperT m a -> m (Maybe a)
+scrapeTextT text scraper = runScraperT scraper $ htmlParseEasy text
+
+-- | Specialised version of 'scrapeTextT'.
+scrapeText :: Text -> Scraper a -> Maybe a
+scrapeText text scraper = runScraper scraper $ htmlParseEasy text
+
+-- | Read the file from the given path and scrape it.
+scrapeFileT :: FilePath -> ScraperT m a -> IO (m (Maybe a))
+scrapeFileT path scraper = T.readFile path <&> flip scrapeTextT scraper
+
+-- | Specialised version of 'scrapeFileT'.
+scrapeFile :: FilePath -> Scraper a -> IO (Maybe a)
+scrapeFile path scraper = T.readFile path <&> flip scrapeText scraper
diff --git a/src/MiniScalp/Types.hs b/src/MiniScalp/Types.hs
new file mode 100644
index 0000000..cb6e366
--- /dev/null
+++ b/src/MiniScalp/Types.hs
@@ -0,0 +1,61 @@
+-- |
+-- Type definitions for MiniScalp.
+--
+-- The main type is a 'ScraperT', which provides the monadic interface to the scraper. It is a monad transformer, so you
+-- can use it in combination with other monads in your scrapers. If you do not need any other monads, you can use the
+-- 'Scraper'.
+module MiniScalp.Types
+  ( -- * Scraper types
+    ScraperT,
+    Scraper,
+    runScraperT,
+    runScraper,
+
+    -- * Auxiliary types
+    ScrapeContext,
+    Predicate,
+  )
+where
+
+import Control.Applicative
+import Control.Monad.Identity
+import Control.Monad.Reader
+import Control.Monad.Trans.Maybe
+import Zenacy.HTML (HTMLNode)
+
+-- | Context of the scraping operation.
+--
+-- The first element represents the stack of parent nodes so we can inspect the ancestors of the current element.
+-- Note that the stack is "reversed", meaning the first element is the immediate parent.
+--
+-- The second element represents the currently focussed node.
+type ScrapeContext = ([HTMLNode], HTMLNode)
+
+-- | Predicate to match HTML elements.
+--
+-- Gets passed the current stack as well as the current node.
+type Predicate = [HTMLNode] -> HTMLNode -> Bool
+
+-- | Main monad of the scraper machinery.
+newtype ScraperT m a = MkScraperT (ReaderT ScrapeContext (MaybeT m) a)
+  deriving (Functor, Applicative, Alternative, Monad, MonadReader ScrapeContext, MonadPlus)
+
+instance MonadTrans ScraperT where
+  lift = MkScraperT . lift . lift
+
+-- | Runs the given scraper.
+runScraperT ::
+  -- | The scraper to run.
+  ScraperT m a ->
+  -- | The initial HTML node (usually the document root).
+  HTMLNode ->
+  -- | The resulting scraped value.
+  m (Maybe a)
+runScraperT (MkScraperT s) n = runMaybeT $ runReaderT s ([], n)
+
+-- | Alias for Scrapers that don't need an additional monadic context.
+type Scraper a = ScraperT Identity a
+
+-- | Analogue to 'runScraperT'.
+runScraper :: Scraper a -> HTMLNode -> Maybe a
+runScraper s n = runIdentity $ runScraperT s n
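
As a quick illustration of how the pieces above fit together, here is a small, self-contained sketch (not part of the patch) that combines the @:, hasClass, and @/ predicates with chroots, attribute, and scrapeText. The HTML snippet, the navLinks name, and the expected result are invented for illustration.

{-# LANGUAGE OverloadedStrings #-}

import Data.Text (Text)
import MiniScalp.Predicates
import MiniScalp.Query
import MiniScalp.Sources
import MiniScalp.Types

-- A tiny document: two links, only one of which sits inside <ul class="nav">.
page :: Text
page = "<ul class=\"nav\"><li><a href=\"/home\">Home</a></li></ul><p><a href=\"/other\">Other</a></p>"

-- Select every <a> element that has an ancestor <ul class="nav">,
-- composing (@:), hasClass and (@/) from MiniScalp.Predicates.
navLinks :: Scraper [Text]
navLinks = chroots ("ul" @: [hasClass "nav"] @/ tag "a") (attribute "href")

main :: IO ()
main = print (scrapeText page navLinks) -- expected: Just ["/home"]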
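
MiniScalp.Types notes that ScraperT is a monad transformer, but the bundled example only uses the plain Scraper alias. The sketch below (not part of the patch; the file name page.html and the linkScraper helper are made up) runs a scraper over IO, using lift to log each match while collecting attribute values.

{-# LANGUAGE OverloadedStrings #-}

import Control.Monad (join)
import Control.Monad.Trans.Class (lift)
import Data.Text (Text)
import qualified Data.Text.IO as T
import MiniScalp.Predicates
import MiniScalp.Query
import MiniScalp.Sources
import MiniScalp.Types

-- Collect the href of every <a class="external"> element and log each
-- match from inside the scraper, which works because the base monad is IO.
linkScraper :: ScraperT IO [Text]
linkScraper = chroots ("a" @: [hasClass "external"]) $ do
  href <- attribute "href"
  lift (T.putStrLn ("found link: " <> href))
  return href

main :: IO ()
main = do
  -- scrapeFileT returns IO (IO (Maybe [Text])) when the base monad is IO,
  -- hence the join to flatten the two layers.
  links <- join (scrapeFileT "page.html" linkScraper)
  print links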