20080122

Irrefutable Pattern Love.

Irrefutable patterns are awesome. If anyone tells you differently, well, they're probably right. I'm 0 for 1 regarding challenging people on the inner workings of Haskell thus far. Anyway, this exerpt from my code is the exact reason irrefutable patterns rock.

 rexn ns pps = let ( ~( xs , rps ) ,
~( ~( nxs ) ,
~( rxs , rrps ) ) ) = ( exn nxs pps ,
case rps of
('?':'?':rr) -> ( ( ns ) ,
( ns ++ xs , rr ) )
('?':rr) -> ( ( ns ) ,
( xs ++ ns , rr ) )
('*':'?':rr) -> ( ( ns ++ xs ) ,
( ns ++ xs , rr ) )
('*':rr) -> ( ( xs ++ ns ) ,
( xs ++ ns , rr ) )
('+':'?':rr) -> ( ( ns ++ xs ) ,
( xs , rr ) )
('+':rr) -> ( ( xs ++ ns ) ,
( xs , rr ) )
_ -> ( ( ns ) ,
( xs , rps ) )
)
in ( rxs , rrps )


reading key :: xs - extracted nodes , rps - remaining-pattern-string , nxs - next-nodes ( what the nodes being created will use as the next step when executing the regex ) , rxs - the set of nodes generated for the section of pattern being interpreted , rrps - remaining remaining pattern string - the final amount after accounting for repitition operators.

See those tildes? Those tell the haskell compiler that I guarantee that the variables being bound there will eventually be filled in and that I want it to assume the patterns will match irrefutably. Guess thats where they got the name. This allows me to do some fun things. If you stare into that code for a few seconds, you'll see that the function exn depends on the variable nxs, which is generated in the case statement, which determines its value based on the remaining pattern string ( rps ) output by exn. For bonus points the next-nodes ( nxs ) sometimes contain the node being generated to create a loop.

This sort function interdependance would explode the stack of any other language with ease. Not Haskell.

The source to the newest iteration of my regex engine is follows. It is incomplete, but the parts that have been implemented seem to be functioning fine.

-- regular expression engine -- (c) 2008 michael speer

import Char ( isSpace )
-- import Debug.Trace ( trace )

xor :: Bool -> Bool -> Bool
xor True a = not a
xor False a = a

data RxToken = RxStart -- matchable start of target string
| RxChar Char -- a literal character to match
| RxBound -- inserted wherever alphanums touch whitespace
| RxEnd -- matchable end of target string
| RxEOF -- an additional token to push through to catch anything trying for RxEnd
-- RxEOF is never matched.
deriving ( Show )

rxTokenize tts = RxStart : case tts of
[] -> RxEnd : RxEOF : []
tts@(t:_) -> case not $ isSpace t of
True -> RxBound : rxt tts
False -> rxt tts
where
rxt (t:[]) | not $ isSpace t = RxChar t : RxBound : RxEnd : RxEOF : []
| otherwise = RxChar t : RxEnd : RxEOF : []
rxt (t:ts@(t':_)) | isSpace t `xor` isSpace t' = RxChar t : RxBound : rxt ts
| otherwise = RxChar t : rxt ts

data RxTransform = RxTransform ( RxNode -> RxToken -> [ RxNode ] )
| RxNullTransform

data RxNode = RxActive { rxTransforms :: [RxTransform] ,
rxMatched :: String ,
rxNumSubs :: Integer ,
rxSubExprs :: [ String ] }
| RxComplete { rxMatched :: String ,
rxNumSubs :: Integer ,
rxSubExprs :: [ String ] }

instance Show RxNode where
show (RxComplete matched _ _) = "<rx|matched:[" ++ matched ++ "]>"

data RxDepth = RxTop | RxSub deriving ( Show )

rxCompile pps = let ( xs , rps ) = oexn [success] RxTop pps
in case length rps of
0 -> RxActive { rxTransforms = xs ,
rxMatched = [] ,
rxNumSubs = 0 ,
rxSubExprs = [] }
_ -> error $ "Not all of pattern consumed : remains : " ++ rps
where
-- or together different expression sections -- (a|b|c)
oexn ns RxTop [] = ( ns , [] )
oexn _ RxSub [] = error "Pattern ended while still in sub expression"
oexn ns d pps = let ( ~( xs , rps ) ,
~( nxs , nrps ) ) = ( aexn ns pps ,
case rps of
('|':rr) -> let ( inxs , irps ) = oexn ns d rr
in ( xs ++ inxs , irps )
(')':rr) -> case d of
RxTop -> error "Erroneous close parenthesis in pattern "
RxSub -> ( xs , rr )
[] -> case d of
RxTop -> ( xs , [] )
RxSub -> error "End of pattern while still in sub expression" )
in ( nxs , nrps )
-- and together extracted nodes in a given expression segment -- abd?dfs
aexn ns pps = let ( ~( xs , rps ) ,
~( nxs , nrps ) ) = ( rexn nxs pps ,
case rps of
('|':_) -> ( ns , rps )
(')':_) -> ( ns , rps )
[] -> ( ns , rps )
_ -> aexn ns rps )
in ( xs , nrps )
-- replication application - weee!
rexn ns pps = let ( ~( xs , rps ) ,
~( ~( nxs ) ,
~( rxs , rrps ) ) ) = ( exn nxs pps ,
case rps of
('?':'?':rr) -> ( ( ns ) ,
( ns ++ xs , rr ) )
('?':rr) -> ( ( ns ) ,
( xs ++ ns , rr ) )
('*':'?':rr) -> ( ( ns ++ xs ) ,
( ns ++ xs , rr ) )
('*':rr) -> ( ( xs ++ ns ) ,
( xs ++ ns , rr ) )
('+':'?':rr) -> ( ( ns ++ xs ) ,
( xs , rr ) )
('+':rr) -> ( ( xs ++ ns ) ,
( xs , rr ) )
_ -> ( ( ns ) ,
( xs , rps ) )
)
in
( rxs , rrps )
-- extract node ( including an entire subexpression as a single node )
exn _ ('?':_) = error "Bad question mark operator"
exn _ ('*':_) = error "Bad splat operator"
exn _ ('+':_) = error "Bad plus sign operator"
exn ns ('(':ps) = oexn ns RxSub ps
exn ns (p:ps) = ( [ RxTransform ( \rxn k -> case k of
(RxChar c) -> if c == p
then
[ RxActive { rxTransforms = ns ,
rxMatched = c : rxMatched rxn ,
rxNumSubs = rxNumSubs rxn ,
rxSubExprs = rxSubExprs rxn } ]
else
[]
(RxStart) -> [rxn]
(RxBound) -> [rxn]
_ -> []
) ] ,
ps )

exn ns [] = error "can this be reached?"

success = RxTransform ( \ rxn k -> [ RxComplete { rxMatched = reverse $ rxMatched rxn ,
rxNumSubs = rxNumSubs rxn ,
rxSubExprs = map reverse $ rxSubExprs rxn } ] )


rxExec n tts = iexec [n] (rxTokenize tts)
where
iexec (win@(RxComplete _ _ _ ):_) _ = Just win
iexec [] _ = Nothing
iexec nns (k:ks) = iexec ( concatMap ( \n -> case n of
(RxComplete _ _ _) -> [n]
a@(RxActive _ _ _ _) -> concatMap (\xf -> case xf of
(RxTransform f) -> f a k
(RxNullTransform) -> []
) (rxTransforms a) ) nns ) ks

main = do
print $ rxTokenize "this is a test"
print $ rxExec (rxCompile "hello|world") "hello"
print $ rxExec (rxCompile "hello|world") "world"
print $ rxExec (rxCompile "abcde|ab") "abcd"
print $ rxExec (rxCompile "ab?c") "ac"
print $ rxExec (rxCompile "ab?c") "abc"
print $ rxExec (rxCompile "ab*c") "ac"
print $ rxExec (rxCompile "ab*c") "abc"
print $ rxExec (rxCompile "ab*c") "abbbc"
print $ rxExec (rxCompile "ab+c") "ac"
print $ rxExec (rxCompile "ab+c") "abc"
print $ rxExec (rxCompile "ab+c") "abbbbbc"
print $ rxExec (rxCompile "(a|b)+") "aaabbbabaaababaaabbbabbaba"
print $ rxExec (rxCompile "abc|") "zyx"


Sweet.

No comments:

About Me

(2) the subculture of the compulsive programmer, whose ethics prescribe that one silly idea and a month of frantic coding should suffice to make him a life-long millionaire. --ewd1036