From 430c17097cba8c970c2826be0e622b6cb7bb6818 Mon Sep 17 00:00:00 2001 From: Daniel Schadt Date: Sat, 2 Dec 2023 11:52:18 +0100 Subject: head commit --- src/MiniScalp.hs | 56 ++++++++++++++++++++++++++++ src/MiniScalp/Predicates.hs | 72 +++++++++++++++++++++++++++++++++++ src/MiniScalp/Query.hs | 91 +++++++++++++++++++++++++++++++++++++++++++++ src/MiniScalp/Sources.hs | 32 ++++++++++++++++ src/MiniScalp/Types.hs | 61 ++++++++++++++++++++++++++++++ 5 files changed, 312 insertions(+) create mode 100644 src/MiniScalp.hs create mode 100644 src/MiniScalp/Predicates.hs create mode 100644 src/MiniScalp/Query.hs create mode 100644 src/MiniScalp/Sources.hs create mode 100644 src/MiniScalp/Types.hs (limited to 'src') diff --git a/src/MiniScalp.hs b/src/MiniScalp.hs new file mode 100644 index 0000000..f0e3e11 --- /dev/null +++ b/src/MiniScalp.hs @@ -0,0 +1,56 @@ +-- | +-- Module : MiniScalp +-- Description : Wrapper around zenacy-html for web scraping +-- Copyright : (c) Daniel Schadt, 2023 +-- License : MIT +-- Maintainer : sample@email.com +-- Stability : experimental +-- Portability : POSIX +-- +-- MiniScalp is a wrapper around [@zenacy-html@](https://hackage.haskell.org/package/zenacy-html) to do web scraping. +-- +-- As @zenacy-html@ implements HTML parsing according to the WHATWG standard, this should produce results equivalent to +-- what your browser produces. +-- +-- = DOM navigation +-- +-- The interface of MiniScalp is similar to that of [@scalpel@](https://hackage.haskell.org/package/scalpel) in the +-- sense that you are provided with a monadic interface to the DOM. The bread and butter function is +-- 'MiniScalp.Query.chroot', which allows you to focus on a specific subpart of the DOM by using +-- 'MiniScalp.Types.Predicate' functions. +-- +-- A number of helpful predicates are defined in "MiniScalp.Predicates". +-- +-- = Example +-- +-- > import Control.Monad +-- > import Data.Maybe +-- > import Data.Text +-- > import MiniScalp.Predicates +-- > import MiniScalp.Query +-- > import MiniScalp.Sources +-- > import MiniScalp.Types +-- > +-- > data MensaLine = MensaLine Text [Text] deriving (Show) +-- > +-- > mensaScraper :: Scraper [MensaLine] +-- > mensaScraper = chroots (tag "tr" @& hasClass "mensatype_rows") $ do +-- > name <- chroot ("td" @: [hasClass "mensatype"]) text' +-- > meals <- chroots ("td" @: [hasClass "menu-title"]) text' +-- > return $ MensaLine name meals +-- > +-- > main :: IO () +-- > main = do +-- > scraped <- fromJust <$> scrapeFile "mensa.html" mensaScraper +-- > forM_ scraped $ \(MensaLine name meals) -> do +-- > putStrLn $ unpack name +-- > forM_ meals $ \meal -> putStrLn (" " ++ unpack meal) +-- > putStrLn "" +-- +-- = Modules +-- +-- * "MiniScalp.Types": Basic definitions of the needed types. +-- * "MiniScalp.Query": Data extraction routines. +-- * "MiniScalp.Predicates": Predicates to select the wanted nodes. +-- * "MiniScalp.Sources": Various data sources. +module MiniScalp () where diff --git a/src/MiniScalp/Predicates.hs b/src/MiniScalp/Predicates.hs new file mode 100644 index 0000000..bcf0a11 --- /dev/null +++ b/src/MiniScalp/Predicates.hs @@ -0,0 +1,72 @@ +-- | Various predicates to match HTML nodes. +module MiniScalp.Predicates + ( tag, + (@&), + (@|), + (@=), + (@/), + (@:), + hasClass, + ) +where + +import Data.List (tails) +import Data.Text (Text) +import MiniScalp.Types +import Zenacy.HTML + +-- | Matches if both predicates match. +-- +-- > tag "tr" @& hasClass "menu-title" +(@&) :: Predicate -> Predicate -> Predicate +a @& b = \s n -> a s n && b s n + +infixl 8 @& + +-- | Matches if one of the predicates matches. +-- +-- > tag "thead" @| tag "tbody" +(@|) :: Predicate -> Predicate -> Predicate +a @| b = \s n -> a s n || b s n + +infixl 7 @| + +-- | Matches if the current node has the given tag. +-- +-- > tag "p" +tag :: Text -> Predicate +tag t _ = htmlElemHasName t + +-- | Matches if the current node has the given attribute and value. +-- +-- > "id" @= "description" +(@=) :: Text -> Text -> Predicate +k @= v = \_ n -> htmlElemHasAttrVal k v n + +infix 9 @= + +-- | Matches if the node has the given class. +-- +-- > hasClass "src" +hasClass :: Text -> Predicate +hasClass c _ = htmlElemClassesContains c + +-- | Matches if the right predicate matches the current node, and the left operand matches a predecessor node. +-- +-- > tag "p" @/ tag "img" +(@/) :: Predicate -> Predicate -> Predicate +a @/ b = \s n -> b s n && or (zipWith a (drop 1 $ tails s) s) + +infixl 1 @/ + +-- | Shorthand to find a specific tag with the given predicates. +-- +-- The following two are equivalent: +-- +-- > "p" @: [hasClass "text", "id" @= "description"] +-- +-- and +-- +-- > tag "p" @& hasClass "text" @& "id" @= "description" +(@:) :: Text -> [Predicate] -> Predicate +t @: a = tag t @& \s n -> all (\p -> p s n) a diff --git a/src/MiniScalp/Query.hs b/src/MiniScalp/Query.hs new file mode 100644 index 0000000..07784fb --- /dev/null +++ b/src/MiniScalp/Query.hs @@ -0,0 +1,91 @@ +-- | Data retrieval functions. +module MiniScalp.Query + ( -- * Simple accessors + node, + parents, + text, + text', + attribute, + html, + + -- * Complex navigation + retrieve, + chroots, + chroot, + ) +where + +import Control.Applicative (empty, optional) +import Control.Monad (forM) +import Control.Monad.Reader (asks, local) +import Data.Maybe (catMaybes) +import Data.Text (Text) +import Data.Text qualified as T +import MiniScalp.Types +import Zenacy.HTML + +-- | Retrieves the current node. +node :: (Monad m) => ScraperT m HTMLNode +node = asks snd + +-- | Retrieves the parents of the current node. +-- +-- Node that the first entry is the immediate parent. +parents :: (Monad m) => ScraperT m [HTMLNode] +parents = asks fst + +-- | Retrieves the text of the current node. +text :: (Monad m) => ScraperT m Text +text = node >>= \n -> maybe empty return $ htmlElemText n + +-- | Recursively retrieves the text of the current node and all children nodes. +text' :: (Monad m) => ScraperT m Text +text' = recurseText <$> node + where + recurseText :: HTMLNode -> Text + recurseText (HTMLDocument _ c) = T.concat $ map recurseText c + recurseText (HTMLDoctype {}) = mempty + recurseText (HTMLFragment _ c) = T.concat $ map recurseText c + recurseText (HTMLElement _ _ _ c) = T.concat $ map recurseText c + recurseText (HTMLTemplate {}) = mempty + recurseText (HTMLText t) = t + recurseText (HTMLComment _) = mempty + +-- | Retrieves the value of the attribute with the given name. +-- +-- Fails if the attribute does not exist. +attribute :: (Monad m) => Text -> ScraperT m Text +attribute a = node >>= \n -> maybe empty return $ htmlElemGetAttr a n + +-- | Retrieves the rendered HTML of the current node. +-- +-- Note that this may not correspond to the original source, as it is re-rendered from the DOM. +html :: (Monad m) => ScraperT m Text +html = htmlRender <$> node + +-- | Retrieves all child contexts for which the given predicate matches. +retrieve :: (Monad m) => Predicate -> ScraperT m [ScrapeContext] +retrieve predicate = do + n <- node + ps <- parents + let includeRoot = predicate ps n + children <- concat <$> forM (htmlNodeContent n) (\child -> local (const (n : ps, child)) (retrieve predicate)) + return $ if includeRoot then (ps, n) : children else children + +-- | Finds the elements according to the predicate and then executes the given scraper in their contexts. +-- +-- If a subscraper fails, it is silently skipped. +chroots :: (Monad m) => Predicate -> ScraperT m a -> ScraperT m [a] +chroots p s = do + es <- retrieve p + catMaybes <$> forM es (\ctx -> local (const ctx) $ optional s) + +-- | Like 'chroots', but only executes the scraper in the first context. +-- +-- If no matching elements are found, this scraper fails. +chroot :: (Monad m) => Predicate -> ScraperT m a -> ScraperT m a +chroot p s = do + cs <- chroots p s + case cs of + a : _ -> return a + [] -> empty diff --git a/src/MiniScalp/Sources.hs b/src/MiniScalp/Sources.hs new file mode 100644 index 0000000..959b41a --- /dev/null +++ b/src/MiniScalp/Sources.hs @@ -0,0 +1,32 @@ +-- | Various entry points for scrapers using different data sources. +module MiniScalp.Sources + ( -- * Scraping in-memory text + scrapeTextT, + scrapeText, + -- * Scraping local files + scrapeFileT, + scrapeFile, + ) +where + +import Data.Functor ((<&>)) +import Data.Text (Text) +import Data.Text.IO qualified as T +import MiniScalp.Types +import Zenacy.HTML (htmlParseEasy) + +-- | Parse and scrape the given 'Text'. +scrapeTextT :: Text -> ScraperT m a -> m (Maybe a) +scrapeTextT text scraper = runScraperT scraper $ htmlParseEasy text + +-- | Specialised version of 'scrapeTextT'. +scrapeText :: Text -> Scraper a -> Maybe a +scrapeText text scraper = runScraper scraper $ htmlParseEasy text + +-- | Read the file from the given path and scrape it. +scrapeFileT :: FilePath -> ScraperT m a -> IO (m (Maybe a)) +scrapeFileT path scraper = T.readFile path <&> flip scrapeTextT scraper + +-- | Specialised version of 'scrapeFileT'. +scrapeFile :: FilePath -> Scraper a -> IO (Maybe a) +scrapeFile path scraper = T.readFile path <&> flip scrapeText scraper diff --git a/src/MiniScalp/Types.hs b/src/MiniScalp/Types.hs new file mode 100644 index 0000000..cb6e366 --- /dev/null +++ b/src/MiniScalp/Types.hs @@ -0,0 +1,61 @@ +-- | +-- Type definitions for MiniScalp. +-- +-- The main type is a 'ScraperT', which provides the monadic interface to the scraper. It is a monad transformer, so you +-- can use it in combinations with other monads in your scrapers. If you do not need any other monads, you can use the +-- 'Scraper'. +module MiniScalp.Types + ( -- * Scraper types + ScraperT, + Scraper, + runScraperT, + runScraper, + + -- * Auxiliary types + ScrapeContext, + Predicate, + ) +where + +import Control.Applicative +import Control.Monad.Identity +import Control.Monad.Reader +import Control.Monad.Trans.Maybe +import Zenacy.HTML (HTMLNode) + +-- | Context of the scraping operation +-- +-- The first element represents the stack of parent nodes so we can inspect the predecessors of the current element. +-- Note that the stack is "reversed", meaning the first element is the immediate parent. +-- +-- The second element represents the currently focussed node. +type ScrapeContext = ([HTMLNode], HTMLNode) + +-- | Predicate to match HTML elements. +-- +-- Gets passed the current stack as well as the current node. +type Predicate = [HTMLNode] -> HTMLNode -> Bool + +-- | Main Monad of the scraper machinery. +newtype ScraperT m a = MkScraperT (ReaderT ScrapeContext (MaybeT m) a) + deriving (Functor, Applicative, Alternative, Monad, MonadReader ScrapeContext, MonadPlus) + +instance MonadTrans ScraperT where + lift = MkScraperT . lift . lift + +-- | Runs the given scraper. +runScraperT :: + -- | The scraper to run. + ScraperT m a -> + -- | The initial HTML node (usually the document root). + HTMLNode -> + -- | The resulting scraped value. + m (Maybe a) +runScraperT (MkScraperT s) n = runMaybeT $ runReaderT s ([], n) + +-- | Alias for Scrapers that don't need an additional monadic context. +type Scraper a = ScraperT Identity a + +-- | Analogue to 'runScraperT'. +runScraper :: Scraper a -> HTMLNode -> Maybe a +runScraper s n = runIdentity $ runScraperT s n -- cgit v1.2.3