// Names this module makes available to importers. This looks like the
// Mozilla JS code module convention (the file itself loads its dependencies
// with Components.utils.import) - NOTE(review): confirm against the loader.
var EXPORTED_SYMBOLS = ["XpatternTester"];

// Unit tests for the xpattern complex

// Here is how to run tests, since I always tend to forget:
//
// cd .../cf/engine/src
// make
// export CF_BASE_PATH=`cd ../../; pwd`
// ./cfrun -u util/xpatternTester
//
// Note for MacOS: export CF_APP_PATH=`pwd`


Components.utils.import('resource://indexdata/util/xmlHelper.js');
Components.utils.import('resource://indexdata/util/xpattern.js');
Components.utils.import('resource://indexdata/util/xpatternText.js');

// Constructor for the exported XpatternTester object.
// It takes no arguments and its body is empty, so a fresh instance
// carries no own state.
// NOTE(review): the test-running methods are presumably attached elsewhere
// in this file - not visible from this section.
function XpatternTester() {}

// Marks an expected count as a known failure by negating it.
//
// Writing KNOWNFAIL(4) in a test entry, instead of a bare -4, makes the
// known failure much more visible when reading the test table.
//
// While working on a known-fail test, put -KNOWNFAIL(4) in the test (with
// a minus in front of it), so that we get a positive number and the test
// will fail. Alternatively use 9999+KNOWNFAIL(4) to guarantee a failure.
//
//  x       - the count to mark as a known failure
//  returns - the arithmetic negation of x
function KNOWNFAIL(x) {
  var negated = -x;
  return negated;
}

//// Define tests here
// This is an array of tests.
// each test is an array with three components:
//  - name
//  - html page fragment, excluding html and body tags
//  - array of xpatterns and expected hitcounts
//    There are two counts, one for number of hits, and the other
//    for total number of hit elements. If these match, the pattern
//    usually matches what is expected. (If not, enable debugging in
//    xpattern, and see what goes on...) When developing tests
//    you can usually deduce the right number of hits. Then just
//    put -1 in exp2, see the test fail, and copy the right number
//    in. Note that it also counts the empty 'break' hits.
//
// If you have a test case that you know will fail, comment it well, and
// put a negative value in exp2. That way, the failure gets noted in
// unit tests, and if some future change happens to fix it, you will get
// notified, and should re-enable the test.


var testArray = [

  [ "dummy", // This is just a dummy test example
    "<h1>dummy test</h1>",
    [
      [ "H1 $title", 1,2 ]
    ]
  ], // dummy
  
  [ "tails",  // test that the text tails are collected when sub-pattern ends
              // Inspired by  BUG-CP-3272 tests below.
    "<h1>tails</h1>" +
    "<div>plain div 1</div>" +
    "<div>div 2 with <b>bold</b></div>" +
    "<div>div 3 with <b>bold</b> inside</div>" +
    "<div>div 4 with <b>bold</b><b>twice</b>inside</div>" +
    "<div>div 5 with <b>bold</b><i>and italics</i>inside</div>"+
    "",
    [
      [ "DIV $div", 5,19 ],
      [ "DIV $div { B $bold } ", 4, 21 ],
      [ "DIV $div { B $bold1 : B $bold2 } ", 1, 7 ],
      [ "DIV $div { B $bold : I $italics } ", 1, 7 ],
      [ "DIV $div { B ? $bold } ", 5, 23 ],
      [ "DIV $div { B + $bold } ", 4, 22 ],
      [ "DIV $div { B * $bold } ", 5, 24 ],
      [ "DIV $div { (B $bold) } ", 4, 21 ],
      [ "DIV $div { (B $bold1 : B $bold2) } ", 1, 7 ],
      [ "DIV $div { (B $bold : I $italics) } ", 1, 7 ],
      [ "DIV $div { (B ? $bold) } ", 5, 23 ],
      [ "DIV $div { (B + $bold) } ", 4, 22 ],
      [ "DIV $div { (B * $bold) } ", 5, 24 ],
      [ "DIV $div { (B $bold)? } ", 5, 23 ],
      [ "DIV $div { (B $bold)+ } ", 4, 22 ],
      [ "DIV $div { (B $bold)* } ", 5, 24 ],
    ]
  ], // dummy
  
  [ "nongreedy" ,
    "<h1>test page</h1>" +
      "<a href='...'>title 1</a><br/>" +
      "<a href='...'>title 2</a><b>first author</b><br/>" +
      "<a href='...'>title 3</a><b>first</b><b>second</b><br/>" +
      "<a href='...'>title 4</a><b>first</b><b>second</b><b>third</b><br/>",
    [
    // Tests for optional '?' and '??'
      // Test optional. Both '?' and '??' should match the two first hits,
      // but no more, because of the BR
      [ "A $title : B ? $author : BR", 2, 6 ], // match 1 and 2.
      [ "A $title : B ?? $author : BR", 2, 6 ], // match 1 and 2.
      // Test that '??' prefers not to match. Should get 2,3,4
      // should get nothing in $skipped, thus 3 hits of 4 elems each.
      [ "A $title : B ?? $skipped : B $author", 3, 12 ],
      // Same test with a regular '?', should get something in $skipped
      // in tests 3 and 4. 
      [ "A $title : B ? $skipped : B $author", 3, 16 ],
    // Tests for repeat '+' and '+?'
      // Regular repeat, anchored by the BR. Hits 2, 3, and 4
      [ "A $title : B + $author : BR", 3, 18 ],
      // Non-greedy repeat, anchored by the BR. Same result
      [ "A $title : B +? $author : BR", 3, 18 ],
      // Un-anchored repeat. Gets 2 hits (3,4) with last author in $foo
      [ "A $title : B + $author : B $foo", 2, 14 ],
      // Same with a non-greedy repeat. 2 hits (3,4), each with only one
      // author in $author, and one in $foo.
      [ "A $title : B +? $author : B $foo", 2, 12 ],
    // Tests for opt.repeat '*' and '*?'
      // Get them all
      [ "A $title : B * $author : BR", 4, 20 ],
      // Same thing with *?, since the BR anchors the match
      [ "A $title : B *? $author : BR", 4, 20 ],
      // Same tests, without anchoring. Should get all authors
      [ "A $title : B * $author ", 4, 20 ],
      // But with a *?, should only take the first author
      [ "A $title : B *? $author ", 4, 14 ],
    // Test that ANY* works.
      // This gets just one hit, with "title1" and the last author, "third",
      // since the ANY* is so greedy it skips the A's and other stuff
      [ "A $title : ANY * : B $author ", 1, 4 ],
      // But with *?, it should prefer not to match at all, and
      // get the first authors
      // The first hit is still wrong, as it gets title1 and the
      // 'first author' of the second hit. The remaining two are all right.
      [ "A $title : ANY *? : B $author ", 3, 12 ],
      // Anchor the test to the BR. Even then gets only one hit
      [ "A $title : ANY * : B $author : BR ", 1, 4 ],
      // but with non-greedy *?, get 3 hits
      [ "A $title : ANY *? : B $author : BR ", 3, 12 ],
      // same, but catch the skipped things in $debug
      // was useful when developing, leave it here...
      [ "A $title : ANY *? $debug : B $author : BR ", 3, 21 ],
    ]
  ], // non-greedy
  
  [ "or" ,
    "<h1>test page</h1>" +
      "<a href='..'>title 1</a><br/>" +
      "<a href='..'>title 2</a><b>au</b><br/>" +
      "<a href='..'>title 3</a><i>id</i><br/>" +
      "<a href='..'>title 4</a><b>au</b><i>id</i><br/>" +
      "<a href='..'>title 5</a><i>id</i><b>au</b><br/>" +
      "<a href='..'>title 6</a><b>au</b><b>au2</b><i>id</i><i>id2</i><br/>" +
      "<a href='..'>title 7</a><b>au</b><i>id</i><b>au2</b><i>id2</i><br/>" +
      "<a href='..'>title 8</a><i>id</i><i>id2</i><br/>" +
      "",
    [
      // simple match. Gets title 3 only
      [ "A $title : I $id  : BR ", 1, 4 ],
      // Simple alternative, gets titles 2 and 3
      [ "A $title : ( I $id | B $author ) : BR ", 2, 8 ],
      // Repeated alternative, gets titles 2 to 7
      [ "A $title : ( I $id | B $author ) + : BR ", 7, 46 ],
      // Repeated opt. alternative, gets them all
      [ "A $title : ( I $id | B $author ) * : BR ", 8, 48 ],
      // Repeated opt. alternative, gets them all, even without closing BR
      [ "A $title : ( I $id | B $author ) * ", 8, 48 ],

      // Match either I's or B's, until first mismatch
      [ "A $title : ( I + $id | B + $author )", 7, 32  ],

      // Alternative with repeated elements, gets titles
      // (because of the BR, returns only those where we get the whole pattern
      // that is titles 2, 3, and 8 , with only one type of followers.
      [ "A $title : ( I + $id | B + $author ) : BR ", 3, 14 ], 
      // Repeated or with repeated elements, gets titles
      [ "A $title : ( I + $id | B + $author ) + : BR ", 7, 46 ],
    ]
  ], // or
  
  [ "attr" ,
    "<h1>test page</h1>" +
      "<span>title 1</span><br/>" +
      "<span class='fooclass'>title 2</span><br/>" +
      "<span style=\"foostyle\">title 3</span><br/>" +
      "<span class='fooclass' style='foostyle'>title 4</span><br/>" +
      "<span style='foostyle' class='fooclass'>title 5</span><br/>" +
      "<span style='barstyle' class='barclass'>title 6</span><br/>" +
      "",
    [
      // simple match. Gets them all
      [ "SPAN $title : BR ", 6, 12 ],
      // Get all that have a class. That is 2,4,5,6
      // These FAIL, looks like everything has a class, even if empty
      [ "SPAN $title [ @class $class] : BR ", 4, KNOWNFAIL(22)  ],  
      [ "SPAN $title [ @class ] : BR ", 4, KNOWNFAIL(8) ],
      
      // Get all with a given value.
      // (the backslash is just to quote the quote inside a quoted string
      [ "SPAN $title [ @class = \"fooclass\" ] : BR ", 3, 6 ],
      
      // same test as above, with regex
      //[ "SPAN $title [ @class ~ \"fooclass\" ] : BR ", 3, 6 ],
      [ "SPAN $title [ @class ~ /fooclass/ ] : BR ", 3, 6 ],
      // real regex test
      //[ "SPAN $title [ @class ~ \"[fF].*?o+cl.s+\" ] : BR ", 3, 6 ],
      [ "SPAN $title [ @class ~ /[fF].*?o+cl.s+/ ] : BR ", 3, 6 ],


      // Get all that match, save in $class
      [ "SPAN $title [ @class =\"fooclass\" $class ] : BR ", 3, 12 ],

      // Get all that match a class and style
      [ "SPAN $title [ @class =\"fooclass\" , @style=\"foostyle\" ] : BR ", 2, 4 ],

      // Get all that match fooclass and extract style: 
      [ "SPAN $title [ @class =\"fooclass\" , @style $style ] : BR ", 2, KNOWNFAIL(2) ],
      // TODO - Fails, gets all three with fooclass, even if no style, as in 2
      // in title2

      // TODO See bug 3407 which describes the problems with the failing tests.
      //  Basically, the parser and matcher are out of sync about the meaning
      // of different combinations.

      // Test the regular expression match
      [ "SPAN $title [ /title 4/ ] : BR ", 1, 2 ],
      
      // And a more complex regexp
      [ "SPAN $title [ /title [2-4]/ ] : BR ", 3, 6 ],
      
      // The grammar allows multiple regexps, so test that.
      // Gets 2 hits, titles 2 and 3.
      [ "SPAN $title [ /title [2-4]/, /title [1-3]/ ] : BR ", 2, 4 ],

      // Test that an attribute and regexp match work together
      // 234 and 245 = 2 hits, 2 and 4.
      [ "SPAN $title [ /title [2-4]/, @class=\"fooclass\" ] : BR ", 2, 4 ],

      
    ]
  ], // attr
  

  [ "textnode" ,
    "<h1>test page</h1>" +
    "<b>Some title</b> <br/>" +
    "Some Author <br/>" +
    "Some year <br/>"+
    "Some description <p/>" +
    "<b>Commented <!--comment--> title</b> <br/>" +
    "Commented <!--comment--> Author <br/>" +
    "Some year <br/>"+
    "Some description <p/>" +
    "<b>A <u>very</u> <i>complex</i> title</b> <br/>" +
    "And <i>very</i> <u>complex</u> Author <br/>" +
    "Some year <br/>"+
    "Some description <p/>" +
      "",
    [
      // Just get the titles. All 3 of them
      [ "B $title ", 3, 10 ],
      // Get the titles and simple authors. Gets only the first #text
      [ "B $title : BR : #text $author",  3, 16 ],
      // Try to get all #text nodes in the author. Skips the #comment
      [ "B $title : BR : #text + $author",  3, 18 ],
      // Try to get all #text nodes in the author. Skips the #comment
      // Does not get the very complex author
      [ "B $title : BR : #text + $author : BR",  2, 11 ],

      // 4. Get all authors
      [ "B $title : BR : #text + $author : " +
      "I ? $author : #text * $author : U ? $author : #text ? $author : BR ",  3, 25 ],

      // get authors, and the comment in one
      // messy pattern, with lots of optionals, but matches all examples above
      [ "B $title : BR : #text $author : #comment ? $comment : #text ? $author :" +
        "I ? $author : #text * $author : U ? $author : #text ? $author : BR ",  3, 27 ],
        
      // Fails: misses the last word 'Author'  in 'very complex author' ###
      [ "B $title : BR : #text * $au :  ANY * $au ! BR : #text * $au : BR ",  3, KNOWNFAIL(24) ],

      // Another approach
      [ "B $title : BR : ( #text $au | i $au | u $au ) +  : #text ? $au: BR",  3, 25 ],

      // Doesn't even need the extra #text node!
      [ "B $title : BR : ( #text $au | i $au | u $au ) + : BR",  3, 25 ],

      // This works too,but gets every part of the author into its own group
      [ "B $title : BR : ( #text $au | i $au | u $au ) + $group : BR",  3, 41 ],

      // This gets the same as above, as expected
      [ "B $title : BR : ( (#text $au | i $au | u $au ) $group ) + : BR",  3, 41 ],

      // whereas this groups all values inside the same group
      [ "B $title : BR : ( (#text $au | i $au | u $au ) +  ) $group : BR ",  3, 31 ],
      
      // Get the comment from the commented title
      [ "B $title {  #text : #comment $comment : #text }", 1,5 ],
      // Same as above, with different placing of the variable name
      // Produces more break hits
      [ "B {  #text $title : #comment $comment : #text $title }", 1,6 ],
      
      // A bit more flexible, allowing more text nodes but only one
      // (sequence of) comment in the middle. Finds all three hits,
      // but of course gets only the beginning of the third title, as it
      // has no provisions for the I and U nodes.
      [ "B {  #text $title  : #comment * $comment : #text * $title }", 3, 10 ], 
      // This was a problem (see bug 3406), but the fix for 4508 seemed
      // to fix this too. If the first text had been with a +, it would have eaten
      // the comment.

      //whereas this gets the whole title, even of the last one, as it all is
      // contained inside the B. The comment gets picked out too.
      [ "B $title {  #text : #comment * $comment : #text * }", 3,12 ],

      // The following pattern is a good workaround, using the non-greedy
      // repeat, which tries the comment before more text
      [ "B {  #text +? $title  : #comment * $comment : #text * $title }", 3, 10 ], 

      // More flexible way to do the same      
      [ "B $title { ( #comment $comment | #text ) * }", 3, 12 ],
      
      [ "B $title { ( #text | #comment $comment ) * }", 3, KNOWNFAIL(12) ],
      // TODO - This fails too, misses the comment
      // Something about skipping comments too quickly
      // See Bug 3406. After 4508, fails to find anything because it collects
      // the textnode into $title before it gets to the or-group at all.
      // After CP-3451 gets the title again, but still misses the comment
      // So, we need to put the #comment first, if in a group. Pretty unlikely case!
    ]
  ], // textnode
  
  [ "BUG-CP-3272-textnode-2" ,
      // See CP-3272 - text nodes inside or-groups go wrong
    "<h1>test page</h1>" +
    "<table>" +
    "<tr><td><a href='http://'>title 1 </a>"+
    "Written by author1 <br/>" +
    "Published by publisher 1 <br/>" +
    "</td></tr>" +
    "<tr><td><a href='http://'>title 2 </a>"+
    "Published by publisher 2 <br/>" +
    "Written by author2 <br/>" +
    "<b>Nonsense</b>" +
    "</td></tr> "+
    "</table>" +
    "",
    [ 
      [ "#text $author [/Wri/] ", 2,4 ],
      [ "#text $author [/Wri/] : br : #text $publisher [/Pub/] ", 1,4 ],
      [ "#text $publisher [/Pub/] : br :#text $author [/Wri/] ", 1,4 ],
      [ "A: #text $publisher [/Pub/] : br :#text $author [/Wri/] ", 1,4 ],
      [ "TD{A: #text $publisher [/Pub/] : br :#text $author [/Wri/] } ", 1,4 ],

      // These used to FAIL - find zero hits. Fixed 10-Oct-2012
      // by checking group/or nodes in FindMatch()
      [ "#text $publisher [/Pub/] | #text $author [/Wri/]", 4,8 ],
      [ "(#text $publisher [/Pub/] | #text $author [/Wri/])", 4,8 ],
      // This gets 8 hits, each of 1 element (au or pub), or none (BR)
      [ "(#text $publisher [/Pub/] | #text $author [/Wri/] | BR)", 8,8 ],
      // This gets two hits, with author and pub in each
      [ "(#text $publisher [/Pub/] | #text $author [/Wri/] | BR)+", 2,8 ],
      // The ANY matches everything from the beginning, so only one hit
      [ "(#text $publisher [/Pub/] | #text $author [/Wri/] | BR | ANY $rest)+", 1,9 ],
      [ "(#text $publisher [/Pub/] | #text $author [/Wri/] | BR | ANY)+", 1,0 ],
      // Repeating group
      [ "A $title : ( #text $description : BR )+ ", 2,12 ],
            
      // Strangely enough, this works. 
      [ "(#text $publisher [/Pub/] | #text $author [/Wri/] | BR | A)+", 2,8 ],
      
      // Strangely enough, this works too.
      [ "A:(#text $publisher [/Pub/] | #text $author [/Wri/] | BR )+", 2,8 ],
      // ANY does not break this!
      [ "A:(#text $publisher [/Pub/] | #text $author [/Wri/] | BR | ANY)+", 2,8 ],

      // Test the child match
      [ "TD { A $title : #text $publisher [/Pub/] : BR : #text $author [/Wri/] }", 1,6 ],
      [ "TD { (A $title : #text $publisher [/Pub/] : BR : #text $author [/Wri/]) }", 1,6 ],
      [ "TD { ( #text $description | A $title) }", 2, 4 ],

      [ "TD { ( #text $description | A $title)+ }", 2,  8  ],
      [ "TD { ( #text $publisher [/Pub/] | #text $author [/Wri/] | BR | A $title) }", 2, 4 ],
      [ "TD { ( #text $publisher [/Pub/] | #text $author [/Wri/] | BR | A $title)+  }", 2, 12 ],
        
      // Simplifying the above
      [ "TD { A $title : #text $description : BR  }", 2,8 ],
      // This FAILS, gets no hits!
      [ "TD { A $title : ( #text $description : BR ) }", 2, 8 ],
      
      [ "TD { A $title : ( #text $description : BR )+ }", 2,12 ],
      [ "TD { (#text $publisher [/Pub/] | #text $author [/Wri/] | BR | A $title)+}", 2,12 ],
      [ "TD { A $title: (#text $publisher [/Pub/] | #text $author [/Wri/] | BR )+}", 2,12 ],
    ]
  ], // BUG-CP-3272-textnode-2
  
  [ "BUG-CP-3451-textnode-3" ,
      // See CP-3451 - text nodes following optional nodes fail to match
    "<h1>test page</h1>" +
    "<table>" +
    "<tr><td><a href='http://'>title 1 </a>"+
    "First description <br/>" +
    "</td></tr>" +
    "<tr><td>"+
    "Second description<br/>" +
    "</td></tr> "+
    "<tr><td><a href='http://'>title 3 </a><a href='http://'>Subtitle-3</a>"+
    "third description <br/>" +
    "</td></tr>" +
    "<tr><td><b>BOLD-4</b><a href='http://'>title 4</a>"+
    "fourth description <br/>" +
    "</td></tr>" +
    "<tr><td><b>BOLD-5</b>"+
    "fifth description <br/>" +
    "</td></tr>" +
    "</table>" +
    "",
    [
      // Gets the titles all right
      [ "TD { A $title }", 2,4 ],
      // Gets one title and some text 
      [ "TD { A $title: #text $description }", 1,4 ],
      
      // This is the canonical case that failed 
      // Finds the first, but misses the second
      // Misses the rest, but that is as expected
      [ "TD { A? $title: #text $description }", 2, 6 ], 
      // This works
      [ "TD { A+ $title: #text $description }", 2, 10 ],
      // A simple workaround
      [ "TD { ( A $title)? : #text $description }", 2, 6 ],
      // The following used to fail, but works after the unskippedDomNode trick
      [ "TD { ( A? $title) : #text $description }", 2, 6 ],
      // Testing with a regular B tag preceding the problem
      [ "TD { B $author : A $title : #text $description }", 1,6 ],
      // Seems not to make much of a difference
      [ "TD { B $author : A? $title : #text $description }", 2, 10],
      // Test to see the description ending in $outer
      [ "TD $outer { A $title: #text $description }", 1,5 ],
      [ "TD $outer { A? $title: #text $description }", 2, 8 ],
      // Test with more optionals. Did not find anything at all!!
      [ "TD { B? $bold : A? $title : I? $italics : #text $description }", 4, 16 ],

    ]
  ], // BUG-CP-3451-textnode-3

  [ "negation-1" ,
    "<h1>test page</h1>" +
    "<img src='http://indexdata.com'/> <b>title 1</b> " +
    "description of the <i>first</i> item with <b>bold</b> text <br/>"+
    "<img src='http://indexdata.com'/> <a href='http://indexdata.com'>title 2</a> "+
    "Some <u>funny</u> text about the <i>second</i> item with <b>lots of <i>markup</i></b> <br/>"+
    "",
    [
      // Test that matches an image, title, and anything up to (but not including) the next image
      [ "IMG: ANY $title : ANY + $description ! IMG ", 2, 17 ],
      [ "IMG: ANY $title : ANY + $description ! IMG : ANY", 1, 7 ],
    ]
  ], // negation-1
  [ "negation-2" ,
    "<h1>test page</h1>" +
    "<table>" +
    "<tr><td><b>title 1</b></td></tr> " +
    "<tr><td>author 1</td></tr> " +
    "<tr><td><font color='red'>publisher</font></td></tr> " +
    "<tr><td><b>title 2</b></td></tr> " +
    "<tr><td><font color='red'>publisher 2</font></td></tr> " +
    "<tr><td><b>title 3</b></td></tr> " +
    "<tr><td>author 1</td></tr> " +
    "<tr><td><b>title 4</b></td></tr> " +
    "</table>"+
    "",
    [
      // Based on an example from DBC. The problem is not to get publisher in $author when
      // the author is not there.
      [ "TR { TD { B $title } } : " +
        "TR ? { TD $author ! TD { FONT } } : " +
        "TR ? { TD { FONT $publisher } } ", 4, 16 ],
      [ "TR { TD { B $title } } : " +
        "TR ? { TD $author } ! TR { TD { FONT } } : " +
        "TR ? { TD { FONT $publisher } } ", 4, 16 ]
      ],
  ], // negation-2

  // Note that in the next tests, we do get the extra item/item entries,
  // those are filtered out when transforming the results after parsing
  [ "group-1",
    "<h1>test page</h1>" +
    "<a href='http://url'>first author</a><b>title 1</b>" +
    "<div><b>123</b><i>available</i>more text</div>" +
    "<div><b>456</b><i>in use</i></div>" +
    "<div><b>789</b><i>lost</i></div>" +
    "<a href='http://url'>second author</a><b>title 2</b>" +
    "<div><b>42</b><i>unknown</i>more text</div>" +
    "",
    [
      [ "A $author [ @href $url ] : B $title : " +
        "( DIV { B $callno : I $available } ) $item ", 2,26 ],
    ],
  ], // group1

  // Bug 4508 - some problems with the all-optional tails of patterns
  // Original problem was with XML connector
  //     bkinfo { ( btl $title | aug  $author | isbn $isbn |  any ) * }
  // failed to match against
  //     <bkinfo/>
  // but adding a '?' in the bkinfo itself made it match.
  // Actually, this test set sidetracked a few times, but found some other
  // details. So I leave it here, and try again in the next test set.
  [ "bug-4508-A",
    "<h1>test page</h1>" +
    "<div>first<b>b1</b><i>i1</i></div><p/>" +
    "<div>second<i>i2</i><b>b2</b></div><p/>" +
    "<div>third<b>b3</b></div><p/>" +
    "<div>fourth<i>i4a</i><i>i4b</i></div><p/>" +
    "<div>fifth</div><p/>" +
    "<div>sixth<b>b6</b></div><p/>" +
    "",
    [
      // This matches all but the fifth, as expected
      [ "DIV  $text { ( B $bold | ANY ) + } ", 5, 22 ],

      // This matches all but the fifth, as expected
      [ "DIV  $text {  ANY + $inside } ", 5, 26 ],

      // This matches also the fifth, ok
      [ "DIV  $text {  ANY * $inside } ", 6, 28 ],

      // This matches all but the fifth, as expected
      // some of the inside nodes come into text, that's ok
      [ "DIV  $text { B $bold | I $italic } ", 5, 23 ],

      // This gets all hits, even the fifth, after the allOptional
      // fix around 4508.
      //[ "DIV  $text { B ? $bold | I $italic } ", 6, 23 ],
      [ "DIV  $text { B ? $bold | I $italic } ", 6, 25 ],
      // This changed behavior around CP-3451, now the | takes precedence,
      // so we try to match the I as well, and we get the i2.   ###
      // Used to be ,23.


      // This matches all hits. Again, the second line goes only to $text,
      // but that is actually OK, same as above, the B? misses the I, but
      // being optional, gets accepted with no hits.
      [ "DIV  $text { B ? $bold | I ? $italic } ", 6, 25 ],
      // CP-3451: was 23 hits, because the I 

      // This is a better way to write optional alternative stuff
      [ "DIV  $text { ( B $bold | A $url ) ? } ", 6, 23 ],


      // Here begins a wild-goose chase that turned out to be instructional,
      // so I leave it in place...

      // Does not match anything, only one hit with no text in it!
      // Actually, this tries the first /html, declares a mismatch, but since
      // optional, accepts it anyway.
      [ "DIV ? $text { ANY* $inside } ", 1,0 ],
      // But why does it show one (empty) hit? Because matchHere has to return
      // something to indicate success.

      // This fixes the previous issue by always requiring the P
      [ "DIV ? $text { ANY* $inside }: P ", 6,28 ],

      // Now with the P in place, this matches everything, all right
      // The fifth line misses the DIV (and the text 'fifth', but
      // being optional, that's OK. The P matches, but produces no
      // $variables. So we get an empty hit for 'fifth', which is OK.
      [ "DIV ? $text { ( ANY $inside ) * } :P ", 6,28 ],

      // Just a quick check, this matches all hits, but collects everything
      // into $text, with no break nodes in between.
      [ "DIV ? $text { ( ANY  ) * } :P ", 6,20 ],

      // End of chasing that wild goose, the :P proves it was a side track
      // about the whole pattern being optional. 


      // This matches all but the fifth, which indeed has no child nodes to
      // match the OR-bag. OK.
      [ "DIV $text { ( B $bold | i $italic | ANY ) + } :P ", 5,26 ],

      // The following three tests define the problem nicely:

      // This matches all six hits all right
      [ "DIV $text { ANY ? $inside  } ", 6,25 ],

      // And so does this, with the improved allOptional check
      [ "DIV $text { ( ANY ? $inside )  } ", 6,25 ],

      // This gets the 'fifth', as the group itself is optional.
      [ "DIV $text { ( ANY  $inside ) ?  } ", 6,25 ],

      // This works too. Gets a few more hit elements to start/stop the $group
      [ "DIV $text { ( ANY  $inside ) ? $group } ", 6,35 ],
       
      [ "DIV $text { #text $footext : ( ANY  $inside ) ? $group  } ", 6, 41 ],
      
      // TODO: 4508
      // These two fail on the fifth line. The DIV ought to match (and indeed
      // it starts to match), since we get an empty hit. But we miss the $text!
      [ "DIV $text { ( B $bold | i $italic | ANY ) ? } :P ", 6,25 ],
      [ "DIV $text { ( B $bold | i $italic | ANY ) * } :P ", 6,28 ],

      // This works now as well. 
      [ "DIV  $text { ( B $bold | i $italic | ANY ) * } ", 6, 28 ],

      // Nested groups work too
      [ "DIV  $text { ( ( ( B $bold | i $italic | ANY ) $foo ) $bar ) * $baz  } ", 6, 76 ],
      [ "DIV  $text { ( ( ( B $bold | i $italic | ANY ) $foo ) * $bar )  $baz  } ", 6, 71 ],
      // The last is a bit questionable, what should it give for the fifth hit?
      // The ANY matches, but does not capture the "fifth". So we get an empty group.
      // It could be argued that the "fifth" should go in $text, but that is far from
      // clear. The difference is between 70 and 71 hit elements.

      // This also succeeds, if any of the alternatives is optional, then
      // the or-group is optional too. Of course it makes no sense to have
      // anything after the optional node, if that fails to match, it gets
      // accepted anyway, and the alternatives do not get tried.
      [ "DIV $text { ( B $bold | i $italic | ANY ? )  } ", 6,25 ],
    ],
  ], // bug4508-A
  [ "bug-4508-B",
    "<xml>" +
    "<h1>test page</h1>" +
    "<e>e1<bkinfo><btl>title 1</btl></bkinfo></e>" +
    "<e>e2<bkinfo><aug>author 2</aug></bkinfo></e>" +
    "<e>e3<bkinfo><btl>title 3</btl><aug>author 3</aug></bkinfo></e>" +
    "<e>e4<bkinfo/></e>" +
    "</xml>" +
    "",
    [
      // This gets them all, of course
      [ "BKINFO $all", 4,8 ],
      // This gets all but the last, empty one
      [ "BKINFO { ANY $inside }", 3,6 ],

      // This gets only three, ok
      [ "E { BKINFO { ANY $inside } }", 3,6 ],
      // so does this. Also author3 gets into $ee
      [ "E $ee { BKINFO { ANY $inside } }", 3,13 ],
      
      // But this gets 4 hits ??!!
      // I see, the bkinfo does not match on the last one (nothing inside)
      // but we accept it anyway, since it is optional.
      [ "E { BKINFO ? { ANY $inside } }", 4,6 ],

      // This demonstrates what happens above. The subtree never matches
      // so the bkinfo doesn't either. But we accept it anyway.
      [ "E { BKINFO ? { NOTMATCHING $inside } }", 4,0 ],
      
      // And this matches all 4 cases properly.
      [ "BKINFO { ANY * $inside }", 4,8 ],

      // This too
      [ "E $ee { BKINFO { ANY * $inside } }", 4,16 ],

      // This gets all 4 hits, but author3 in ee, since the ANY
      // isn't repeating. Fine!
      [ "E $ee { BKINFO { ANY ? $inside } }", 4,15 ],

      // Adding a group makes no difference
      [ "E $ee { BKINFO { (ANY * $inside) } }", 4,16 ],
      // And the optionality can be on the group as well
      [ "E $ee { BKINFO { (ANY $inside)* } }", 4,16 ],

      // And now finally the actual case that triggered bug 4508
      [ "E $ee { BKINFO { ( btl $title | aug  $author | any ) * } }", 4,16 ],
      
      
    ]
  ], // bug-4508-B
  // testing the regexp match in #text nodes (bug 4552)
  // for simplicity, all hits are between <hr/> and <p/>
  [ "textmatch",
    "<h1>Page title</h1>" +
    "<hr/>First title <br/>By First author <br/>Date: 2001<p/>" +
    "<hr/>Second title <br/>Date: 2002<p/>" +
    "<hr/>Third title <br/>By Third author<p/> " +
    "<hr/>By fourth author<br/>Date: 2004<p/> "+
    "<hr/>Fifth title <p/>" +
    "",
    [
      // Simple pattern, gets only the first
      [ "HR: #text $title : BR : #text $author : BR : #text $date : P" , 1, 6 ],
      // This matches the first 4, including a title "By fourth author"
      [ "HR: #text $title : BR : #text $author : BR ? : #text ? $date : P" , 4, 18 ],
      // This tries to limit the matches by patterns
      [ "HR: #text $title : BR : #text $author [ /By/ ] " , 2, 8 ],
      // Same, with optional author
      [ "HR: #text $title : BR : #text ? $author [ /By/ ] " , 4, 12 ],
      // and with date too, optionals in BRs as well
      // Gets all 5 hits, but a title "By fourth author"
      [ "HR: #text $title : BR ? : #text ? $author [ /By/ ] : "+
              " BR ? : #text ? $date [ /Date/ ]  " , 5, 16 ],
      // Even this gets the fourth wrong
      [ "HR: #text ? $title : BR ? : #text ? $author [ /By/ ] : "+
              " BR ? : #text ? $date [ /Date/ ] : P  " , 5, 16],

      // This gets 2 hits, first and second.
      // The fourth is missing, as it does not have a title.
      [ "HR: #text $title ! #text [/By/] : BR ?: #text ? $author : " +
        "BR : #text $date [ /Date/ ]: P" , 2, 10 ],

      // The BR tends to eat the #text nodes, but if they are first in a group,
      // things work better. This gets three hits (all that end in date),
      // with proper titles or authors.
      [ "HR : ( #text  $title ! #text [/By/] : BR )? : ( #text $author : BR  )? : " +
        " #text $date [ /Date/ ]: P " , 3, 14 ],

      // Trying to explicitly avoid getting the author in the title
      [ "HR: #text ? $title ! #text [ /By/ ]: BR ? : #text ? $author [ /By/ ] : "+
        " BR ? : #text ? $date [ /Date/ ] : P  " , 5, 14 ],
      // This used to fail, gives an empty hit for the fourth. Bug 4554
      // (There was a long side track of failed experiments. The problem was found
      // to be in the BRs, which ate the text in a bit too greedy way. See bug 4508.
      // Fixed in the matchUnconditional fix in CP-3451
        
      // Finally, this gets them all. All BRs must be optional, since they can be missing
      // in various places (fifth and third)
      [ "HR : ( #text  $title ! #text [/By/] : BR? )? : ( #text $author [/By/]: BR?  )? : " +
        " #text ? $date [ /Date/ ]: P " , 5, 20 ],

      // Trying to fix the same with an OR-bag
      // Ignoring the BRs in between. This works pretty well too. The #text nodes
      // have to be in the beginning of the OR list, and the ones with patterns
      // need to be before the one without a pattern (title)
      [ "HR : ( #text $author [ /By/ ] | #text $date [ /Date/ ] | " +
               "#text $title | BR ) + : P", 5, 20 ],
    ]
  ], // textmatch
  // Bug CP-3396: Text mismatch
  // Original pattern in the bug was:
  //    tr { td : td $author { ( #text ? $callno [ /Call / ] ) $item } } :
  [ "Bug-3396-textnode",
    "<h1>Page title</h1>" +
    "<table><TR>" +
    "<TD>nevermind</TD>" +
    "<TD>Call 17 <b>author</b> </TD>" +
    "</TR></table>" +
    "",
    [
      // The original problem
      [ "TR { TD: TD $author { ( #text ? $callno [ /Call / ] ) $item } } ",1, 6 ],
        // This actually works!
        // TODO - This test may be removed after the first release in 2013. The
        // bug has been fixed before it was reported.
    ]
  ],
 
  // Similar to textmatch above, but with #comment nodes
  [ "commentmatch",
    "<h1>Page title</h1>" +
    "<hr/><!--First title--><br/><!--By First author--><br/><!--Date: 2001--><p/>" +
    "<hr/><!--Second title--><br/><!--Date: 2002--><p/>" +
    "<hr/><!--Third title--><br/><!--By Third author--><p/> " +
    "<hr/><!--By fourth author--><br/><!--Date: 2004--><p/> "+
    "<hr/><!--Fifth title--><p/>" +
    "",
    [
      // Simple pattern, gets only the first
      [ "HR: #comment $title : BR : #comment $author : BR : #comment $date : P" , 1, 6 ],
      // This tries to limit the matches by patterns
      [ "HR: #comment $title : BR : #comment $author [ /By/ ] " , 2, 8 ],
      // Same, with optional author
      [ "HR: #comment $title : BR : #comment ? $author [ /By/ ] " , 4, 12 ],
      // and with date too, optionals in BRs as well
      // Gets all 5 hits, but a title "By fourth author"
      [ "HR: #comment $title : BR ? : #comment ? $author [ /By/ ] : "+
        " BR ? : #comment ? $date [ /Date/ ]  " , 5, 20 ],  // used to be 16. CP-3451

    // Trying to explicitly avoid getting the author in the title
    [ "HR: #comment ? $title ! #comment [ /By/ ]: BR ? : #comment ? $author [ /By/ ] : "+
      " BR ? : #comment ? $date [ /Date/ ] : P  " , 5, 18 ],
    // Bug CP-2546 CP-3451 seems to have fixed this.

    // Trying to fix the same with an OR-bag
    // Ignoring the BRs in between.
    [ "HR : ( #comment $author [ /By/ ] | #comment $date [ /Date/ ] | " +
      "#comment $title | BR ) + : P", 5, 20 ],
    ],
  ], // commentmatch
   
  [ "regex-attr", // bug #4911
    "<h1>test page</h1>" +
      "<span style='foostyle' class='fooclass'>title 5</span><br/>" +
      "<span style='barstyle' class='barclass'>title 6</span><br/>" +
      "<A href='http://somewhere.com/search.asp?author=shakespeare'>Shakespeare</A>" +
      '<a href = "http://somewhere.com/search.asp?title=hamlet">Hamlet</a>' +
      "",
    [
      [ "A [ /Shakespeare/ ]", 1, 0 ],
      [ "A [ /shakespeare/ ]", 0, 0 ],
      [ "A $author [ @href ~ /author=/ ]", 1, 2 ],
      [ "A $title [ @href ~ /title=/ ]", 1, 2 ],
      [ "A [ @href ~ \"/search.asp\?\" ]", 2, 0 ],
    ]
  ], // regex-attr
  
  [ "html-modifier", 
    "<h1>test page</h1>" +
      "<b>first</b><span><u>underlined</u></span>" +
      "<b>second</b><span>Plain text<i>italics</i></span>" +
      "",
    [ // TODO - The test harness does not build us a good page. so we do not
      // really get the innerHTML from our nodes. Need a test connector for that
      // The modifier code falls back to a XML dump of the node, which is quite
      // good enough for here.
      [ "B $title : span $inner -html", 2, 8 ],
    ]
  ], // html-modifier
  [ "xml-modifier",
    "<h1>test page</h1>" +
      "<b>first</b><span><u>underlined</u></span>" +
      "<b>second</b><span>Plain text<i>italics</i><!--comment--></span>" +
      "",
    [
      [ "B $title : span $inner -xml", 2, 8 ],
      [ "B $title : span $inner -xml { ANY+ $inside }", 2, 12 ],
      [ "(B $title : span $inner -xml { ANY+ $inside }) $group", 2, 17 ],
    ]
  ], // xml-modifier
  [ "textcontent-modifier",
    "<h1>test page</h1>" +
      "<b>first</b><span><u>underlined</u></span>" +
      "<b>second</b><span>Plain text <i>italics</i><!--comment--></span>" +
      "",
    [
      [ "B $title : span $inner -textcontent", 2, 8 ],
      [ "B $title : span $inner -textcontent { ANY+ $inside }", 2, 12 ],
    ]
  ], // textcontent-modifier
  [ "textcontent-modifier",
    "<h1>test page</h1>" +
      "<b>first</b><span> <u> underlined  </u>  </span>" +
      "<b>second</b><span>Plain text\n<i> \t italics \n </i>\n  \n<!--comment--></span>" +
      "",
    [
      [ "B $title : span $inner -whitespace", 2, 12],
      [ "B $title : span $inner -whitespace { ANY+ $inside }", 2, 14 ],
    ]
  ], // whitespace-modifier (the set name above duplicates "textcontent-modifier")
  // CP-1738: $none gets set
  [ "CP-1738-$none",
    "<h1>test page</h1>" +
      "<i>first</i><span>Some text</span>" +
      "<i>second</i><span>More <b>bold</b> text</span>" +
      "",
    [
      [ "i $title : span $description", 2, 10],
      [ "i $title :span $none", 2, 4],
      [ "i $title :span $description { B $none }", 1, 5],
      [ "i $title :span $description { B ? $none }", 2, 9],
      [ "i $title :span $description { B $none [ /bold/ ] }", 1, 5],
      [ "i $title :span $description { B $none [ /nothing/ ] }", 0, 0],
      [ "i $title :span $description { B ? $none [ /bold/ ] }", 2, 9],
      [ "i $title :span $description { B ? $none [ /nothing/ ] }", 2, 10],
    ]
  ], // CP-1738-$none

]; // testArray

// Module-level counters for the test run.
// Number of patterns checked so far. Bumped in oneTest for every pattern
// that onePattern accepts, which includes the known failures.
var tests = 0;
// Number of test sets that have run through without a failure (bumped in oneTest)
var sets = 0;
// Number of patterns that failed, but were marked as known failures
// (bumped in onePattern)
var knownfails = 0;

// Strip every run of whitespace out of a string. Used for comparing
// two pattern strings without caring about how they are spaced.
//  str - the string to clean
// Returns the string with all whitespace removed.
XpatternTester.prototype.cleanstr = function ( str ) {
  var pieces = str.split(/[\s]+/);
  return pieces.join("");
};

// Test one xpattern against a (parsed) doc.
//
// Parses the pattern text, checks that dumping the parsed pattern gives
// back the same text (modulo whitespace), matches the pattern against the
// doc, and compares the hit counts with the expected ones.
//
//  title - name of the test, used in the messages
//  doc   - the document to match against, passed as is to xp.match()
//  patt  - the xpattern, as text
//  exp1  - expected number of hits
//  exp2  - expected total number of hit elements, summed over all hits
// A negative exp1 or exp2 marks a known failure (see KNOWNFAIL); only the
// absolute values are compared against the results.
//
// Returns true if the test passed, or if it failed as a known failure
// (which bumps knownfails). Returns false, after dumping diagnostics, if
// the test failed, or if a known failure unexpectedly passed.
XpatternTester.prototype.onePattern = function ( title, doc, patt, exp1, exp2 ) {
  var xp;
  var failmsg = "";
  try {
    var par = new XpatternTextParser(patt);
    xp = par.parse();
    // check that we dump the same string as we parsed, modulo whitespace
    var xpstr = xp.dumpString(-1);
    if ( this.cleanstr(xpstr) != this.cleanstr(patt) ) {
      failmsg = "Test " + title + " FAILED! \n";
      failmsg += "Pattern parsed and dumped different strings\n";
      failmsg += "pattern: '" + patt + "'\n";
      failmsg += "dumped:  '" + xpstr + "'\n";
    }
    var hitsarr = xp.match(doc);
    // Total number of hit elements, over all the hits
    var hitelems=0;
    for ( var i in hitsarr ) {
        hitelems += hitsarr[i].hits.length;
    }
    // Only the first problem gets reported, so skip the count checks
    // if we already have a message
    if ( (!failmsg) && hitsarr.length != Math.abs(exp1) ) {
      failmsg = "Test " + title + " FAILED! \n" ;
      failmsg += "Expected " + exp1 + " hits, got " +hitsarr.length +
        " (and " + hitelems + "/" + exp2 + " elements)\n";
    }
    if ( (!failmsg) && hitelems != Math.abs(exp2) ) {
      failmsg = "Test " + title + " FAILED! \n" ;
      failmsg += "Got " + exp1 + " hits all right, but " +
        hitelems + " hit elements instead of " + exp2 + "\n";
    }
  } catch (e) {
    // Anything thrown while parsing or matching ends up here
    failmsg  = "Test " + title + " FAILED! \n" ;
    failmsg += "Parse error (bad test?) " + e + "\n'" + patt + "'\n" ;
    if ( e.fileName ) { // catches syntax errors in code, and other interesting things
        failmsg += "  in " + e.fileName + "  line " + e.lineNumber + "\n";
    }
  }
  if ( failmsg && ( exp1 < 0 || exp2 < 0 ) ) {
    // Failed as expected. Count it, and let the test run go on.
    dump("Test " + title + " is a known failure \n");
    knownfails++;
    return true;
  }
  if ( exp1 < 0 || exp2 < 0 ) {
    failmsg = "Test " + title + " was a known failure, but IT WORKED\n";
    // Yes, this is a serious failure. Need to investigate why a test
    // suddenly starts working, and if that is true, fix the test!
  }
  if ( failmsg ) {
    if ( hitsarr ) {
      // The message goes out both before and after the hits, so it
      // can be found at either end of a long dump
      dump (failmsg);
      for ( var i in hitsarr ) {
        dump("Hit " + i + "\n");
        hitsarr[i].dump();
      }
    } else {
      dump ("no hits to dump \n");
    }
    dump (failmsg);
    dump (patt + "\n");
    // Show the parsed pattern too, if the parsing got that far.
    // (The result of dumpString used to be thrown away here, so
    // nothing was ever shown.)
    var parsed = "";
    if (xp) {
      parsed = xp.dumpString(0);
    }
    dump (parsed);

    return false;
  }
  return true;
};

// Run one test set: wrap the html fragment into a page, parse it, and
// check every pattern of the set against it.
//  tst - one test set: [ name, html fragment, patterns ], where patterns
//        is an array of [ xpattern, expected hits, expected hit elements ]
// Returns true if every pattern passed. Returns false at the first
// pattern that fails; the rest of the set is not run.
XpatternTester.prototype.oneTest = function ( tst ) {
  var setname = tst[0];
  dump("Starting test set '" + setname + "'\n" );
  var patterns = tst[2];

  // The page goes through xmlHelper instead of a DOM parser built here.
  // (An earlier attempt with nsIDOMParser had to use "text/xml", since
  // "text/html" failed.)
  // TODO - For some reason we don't have a proper html document here, just
  // a plain DOM tree, as from XML. Therefore, it does not have innerHTML
  // set, and the test for html-modifier gets funny results
  var doc = xmlHelper.docFromString("<html><body>" + tst[1] + "</body></html>");

  for ( var idx in patterns ) {
    var entry = patterns[idx];
    var label = setname + "." + idx;
    var pattern = entry[0];
    var passed = this.onePattern( label, doc, pattern, entry[1], entry[2] );
    if ( !passed ) {
      return false;
    }
    dump("OK: " + label + " " + pattern + "\n");
    tests++;
  }
  sets++;
  dump("Test set " + sets + " '" + setname + "' OK\n" );
  return true;
};

// The whole unit testing: runs every test set in testArray, and dumps
// a summary line at the end.
// All three counters are reset at the start, so that running this more
// than once does not carry known failures over from an earlier run.
// Returns true if all the sets passed (known failures are OK). Returns
// false as soon as one set fails; the remaining sets are not run, and
// no summary is dumped.
XpatternTester.prototype.unitTest = function ( ) {
  dump("Unit test for XpatternTester starting \n");
  tests = 0;
  sets = 0;
  knownfails = 0;
  for ( var t in testArray ) {
    if ( !this.oneTest(testArray[t]) ) {
      return false;
    }
  }
  if ( knownfails ) {
    // Known failures are included in tests, so take them off the passed count
    dump("XpatternTester: " + (tests-knownfails) + " tests of " + tests +
       " passed (in " + sets + " sets), and " + knownfails + " were known fails. " +
       "That is OK.\n");
  } else {
    dump("XpatternTester: All " + tests + " tests in " + sets +
       " sets passed OK!\n");
  }
  return true;
};

