From 430c17097cba8c970c2826be0e622b6cb7bb6818 Mon Sep 17 00:00:00 2001 From: Daniel Schadt Date: Sat, 2 Dec 2023 11:52:18 +0100 Subject: head commit --- src/MiniScalp.hs | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 src/MiniScalp.hs (limited to 'src/MiniScalp.hs') diff --git a/src/MiniScalp.hs b/src/MiniScalp.hs new file mode 100644 index 0000000..f0e3e11 --- /dev/null +++ b/src/MiniScalp.hs @@ -0,0 +1,56 @@ +-- | +-- Module : MiniScalp +-- Description : Wrapper around zenacy-html for web scraping +-- Copyright : (c) Daniel Schadt, 2023 +-- License : MIT +-- Maintainer : sample@email.com +-- Stability : experimental +-- Portability : POSIX +-- +-- MiniScalp is a wrapper around [@zenacy-html@](https://hackage.haskell.org/package/zenacy-html) for web scraping. +-- +-- As @zenacy-html@ implements HTML parsing according to the WHATWG standard, MiniScalp should produce results equivalent to +-- what your browser produces. +-- +-- = DOM navigation +-- +-- The interface of MiniScalp is similar to that of [@scalpel@](https://hackage.haskell.org/package/scalpel) in the +-- sense that you are provided with a monadic interface to the DOM. The bread-and-butter function is +-- 'MiniScalp.Query.chroot', which allows you to focus on a specific subpart of the DOM by using +-- 'MiniScalp.Types.Predicate' functions. +-- +-- A number of helpful predicates are defined in "MiniScalp.Predicates". 
+-- +-- = Example +-- +-- > import Control.Monad +-- > import Data.Maybe +-- > import Data.Text +-- > import MiniScalp.Predicates +-- > import MiniScalp.Query +-- > import MiniScalp.Sources +-- > import MiniScalp.Types +-- > +-- > data MensaLine = MensaLine Text [Text] deriving (Show) +-- > +-- > mensaScraper :: Scraper [MensaLine] +-- > mensaScraper = chroots (tag "tr" @& hasClass "mensatype_rows") $ do +-- > name <- chroot ("td" @: [hasClass "mensatype"]) text' +-- > meals <- chroots ("td" @: [hasClass "menu-title"]) text' +-- > return $ MensaLine name meals +-- > +-- > main :: IO () +-- > main = do +-- > scraped <- fromJust <$> scrapeFile "mensa.html" mensaScraper +-- > forM_ scraped $ \(MensaLine name meals) -> do +-- > putStrLn $ unpack name +-- > forM_ meals $ \meal -> putStrLn (" " ++ unpack meal) +-- > putStrLn "" +-- +-- = Modules +-- +-- * "MiniScalp.Types": Definitions of the basic types. +-- * "MiniScalp.Query": Data extraction routines. +-- * "MiniScalp.Predicates": Predicates for selecting the desired nodes. +-- * "MiniScalp.Sources": Various data sources. +module MiniScalp () where -- cgit v1.2.3