/*
   The goal here is to extract meaningful numbers from citations like:
   http://www.sciencedirect.com/science
        Water Research, Volume 17, Issue 2, 1983, Pages 121-132
   http://psycnet.apa.org/
        Behavioral Neuroscience, Vol 124(5), Oct 2010, 706-709
   http://www.ingentaconnect.com/
        Source: Canadian Journal of Earth Sciences, Volume 47, Number 9, 1 September 2010 , pp. 1269-1275(7)
   http://www.ncbi.nlm.nih.gov/pubmed
        Proc Natl Acad Sci U S A. 2010 Aug 31;107(35):15357-61. Epub 2010 Aug 30.
   http://onlinelibrary.wiley.com/
        BIOLOGICAL REVIEWS Volume 85, Issue 1, February 2010, Pages: 55–110
 */

var EXPORTED_SYMBOLS = ["ExtractChronology"];
Components.utils.import('resource://indexdata/runtime/Step.js');
Components.utils.import('resource://indexdata/runtime/StepError.js');
Components.utils.import('resource://indexdata/util/xmlHelper.js');
Components.utils.import('resource://indexdata/util/xulHelper.js');
Components.utils.import("resource://indexdata/util/logging.js");

// Constructor for the ExtractChronology step. It takes no arguments and
// sets no instance state of its own.
var ExtractChronology = function () {
};
// Inherit by using a Step instance as the prototype object.
// (Step is presumably provided by the Step.js import at the top of the
// file -- confirm.)
ExtractChronology.prototype = new Step();
// The assignment above replaced the whole prototype object, so point its
// "constructor" property back at this class.
ExtractChronology.prototype.constructor = ExtractChronology;

// init does nothing for this step.
ExtractChronology.prototype.init = function() {};

// Module-level logger, used by unitTest and record_matches below.
// (logging is presumably provided by the logging.js import -- confirm.)
var logger = logging.getLogger();

/*
 * Describes this step on the given surface by appending a single
 * "description" node, via xmlHelper.appendNode, holding the text below.
 *
 * NOTE(review): the text names "year" and "pages" fields, while the rule
 * table in this file writes to "date", "page" and "endpage" -- confirm
 * which wording is intended before touching the string.
 */
ExtractChronology.prototype.draw = function(surface) {
    var blurb = 'This step takes the "citation" field from records in $.output.results[] and attempts to automatically extract values from the citation to populate the "volume", "issue", "year", and "pages" fields.';

    xmlHelper.appendNode(surface, "description", blurb);
};

/*
 * Runs the step: for every record in task.output.results, passes each
 * entry of the record's "citation" field to ExtractChronology.extract,
 * which adds the extracted fields to that same record.
 *
 * Parameters:
 *   task - object whose output.results holds the records to process.
 *
 * Records without a "citation" field, and empty (null/undefined) records,
 * are skipped. Returns nothing; the records are modified in place.
 */
ExtractChronology.prototype.run = function (task) {
  var results = task.output.results;
  var hasOwn = Object.prototype.hasOwnProperty;

  for (var i in results) {
    // for...in also visits enumerable properties inherited through the
    // prototype chain; only the container's own entries are records.
    if (!hasOwn.call(results, i)) {
      continue;
    }
    var result = results[i];
    if (result === null || result === undefined) {
      continue;
    }
    var citations = result["citation"];
    if (!citations) {
      continue;
    }
    for (var j in citations) {
      if (!hasOwn.call(citations, j)) {
        continue;
      }
      ExtractChronology.extract(citations[j], result);
    }
  }
};

/*
 * Ordered table of extraction rules consumed by ExtractChronology.extract.
 *
 * Each rule is an array of the form
 *     [ category, regexp, [matchIndex, fieldName], ... ]
 * where each trailing pair says which capture group of the regexp is
 * copied into which output field. Rules are tried from top to bottom and,
 * once a rule of some category has matched, later rules of the same
 * category are skipped -- so more specific rules must come before the
 * fallbacks of their category.
 */
ExtractChronology.regexps =
  [
   // Numeric page range. The separator may be a hyphen, an en dash, a
   // literal "&ndash;" entity, or the CTRL-S (0x13) control character that
   // Wiley uses to separate pages (presumably an en dash, U+2013, cut down
   // to its low byte -- unconfirmed).
   // CTRL-S is spelled as an explicit \x13 escape: a raw control character
   // is invisible in the source and easily stripped, and an empty
   // alternative in its place would let this rule split any multi-digit
   // number in two (e.g. "17" -> page 1, endpage 7).
   [ "pages",  /(\d+)\s*(-|–|&ndash;|\x13)\s*(\d+)/, [1, "page"], [3, "endpage"] ],
   [ "volume", /(\d+),\s*no\.\s*(\d+)/i,           [1, "volume"], [2, "issue"] ],
   [ "volume", /(\d+)\s*\((\d+)\)/,                [1, "volume"], [2, "issue"] ],
   [ "volume", /v(\d+)\s*n(\d+)/,                  [1, "volume"], [2, "issue"] ],
   // Only fall back to 10a-12a page-ranges if there is no all-numeric range
   [ "pages",  /(\d+[a-z]+)\s*(-|–|&ndash;|\x13)\s*(\d+[a-z]+)/i, [1, "page"], [3, "endpage"] ],
   [ "year",   /\b[a-z]{3}\s+(\d{4})\b/i,          [1, "date"] ], // Ignores month
   [ "volume", /(Volume|Vol\.)\s+(\d+)/i,          [2, "volume"] ],
   [ "issue",  /(Issue|No\.)\s+(\d+)/i,            [2, "issue"] ],
   [ "issue",  /(Number)\s+(\d+)/i,                [2, "issue"] ],
   [ "pages",  /\bp\.\s+(\d+)/i,                   [1, "page"] ],
   // Fallback if we can't do any better at guessing a page-range
   [ "pages",  /(\d+)\s*[^a-z0-9,.]\s*(\d+)/i,     [1, "page"], [2, "endpage"] ],
   // Fallback follows if we can't do any better at guessing a year
   [ "year",   /\b[A-Z][a-z]+\s+(\d{4})\b/,        [1, "date"] ],
   [ "pages",  /\bp\W*(\d+)/,                      [1, "page"] ],
   [ "pages",  /,\s*(\d+)/,                        [1, "page"] ],
  ];

/*
 * Extracts chronology values from the citation string `s` and appends
 * them to the record `rec`.
 *
 * Strategy: we work our way down the list of pre-cooked regular
 * expressions in ExtractChronology.regexps, trying each one against what
 * is left of the input string.  When we get a match, we copy the matching
 * sub-expressions into the specified output fields, remove the whole of
 * the matching string, and make a note that we have a match in the
 * specified category.  Subsequent regular expressions in the same
 * category are skipped, so that we avoid accidentally overwriting a good
 * match with a less good one.
 *
 * Parameters:
 *   s   - the citation text.  null and undefined are treated as an empty
 *         string; any other value is converted with String().
 *   rec - the record to update.  For each extracted field, rec[field] is
 *         created as an empty array if it is undefined, and the extracted
 *         value (a string) is pushed onto it.
 *
 * Returns nothing.
 */
ExtractChronology.extract = function(s, rec) {
  // Work on a real string so that the slicing below cannot fail on
  // string-like objects.
  var text = (s === null || s === undefined) ? "" : String(s);
  // Prototype-less maps: only keys we set ourselves can ever be found or
  // enumerated, whatever has been added to the built-in prototypes.
  var categoriesSeen = Object.create(null);
  var data = Object.create(null);
  var rules = ExtractChronology.regexps;

  for (var i = 0; i < rules.length; i++) {
    var rule = rules[i];
    var category = rule[0];
    if (categoriesSeen[category]) {
      // A better rule of this category already matched; don't even run
      // the regular expression.
      continue;
    }

    var matches = rule[1].exec(text);
    if (!matches) {
      continue;
    }

    categoriesSeen[category] = true;
    // Cut the matched text out so that later rules cannot re-use it.
    text = text.slice(0, matches.index) +
           text.slice(matches.index + matches[0].length);

    // Entries from index 2 onwards are [matchIndex, fieldName] pairs.
    for (var j = 2; j < rule.length; j++) {
      var pair = rule[j];
      data[pair[1]] = matches[pair[0]];
    }
  }

  for (var key in data) {
    if (rec[key] === undefined) {
      rec[key] = [];
    }
    rec[key].push(data[key]);
  }
};

/*
 * Returns the name of this step's class: always the string
 * "ExtractChronology".
 */
ExtractChronology.prototype.getClassName = function () {
  var className = "ExtractChronology";
  return className;
};

/*
 * Returns the label under which this step is shown: always the string
 * "Extract chronology".
 */
ExtractChronology.prototype.getDisplayName = function () {
  var label = "Extract chronology";
  return label;
};

/*
 * Returns a one-sentence, human-readable summary of what this step does.
 *
 * NOTE(review): the text speaks of "a nominated field", but run() in this
 * file always reads the "citation" field -- confirm whether the wording
 * should be tightened.
 */
ExtractChronology.prototype.getDescription = function () {
  return "Extracts chronology data (volume, issue, page-range, year) " +
         "from a nominated field, usually 'citation', and copies them " +
         "into individual fields.";
};

/*
 * Returns the version of this step as a string: currently "0.2".
 */
ExtractChronology.prototype.getVersion = function () {
  var version = "0.2";
  return version;
};

/*
 * Returns the rendering of this step's arguments, which is always the
 * empty string.
 */
ExtractChronology.prototype.renderArgs = function () {
  var rendered = "";
  return rendered;
};

/*
 * Brings a stored configuration up to the current version of the step.
 *
 * Parameters:
 *   confVer - version recorded in the configuration being upgraded.
 *   curVer  - version of this step.
 *   conf    - not used by this implementation.
 *
 * Returns false when confVer is greater than curVer (the configuration
 * is newer than the step, so it cannot be upgraded), and true otherwise.
 * When confVer is below 0.2, this.conf is replaced by an empty object.
 *
 * NOTE(review): if both versions arrive as strings, the first comparison
 * is lexicographic rather than numeric -- confirm what callers pass.
 */
ExtractChronology.prototype.upgrade = function (confVer, curVer, conf) {
  var configurationIsNewer = confVer > curVer;
  if (configurationIsNewer) {
    return false;
  }

  var predatesVersion02 = confVer < 0.2;
  if (predatesVersion02) {
    this.conf = {};
  }

  return true;
};

/*
 * Gives the default value of a capability flag for this step.
 *
 * Returns true for "result-volume", "result-issue", "result-year",
 * "result-page" and "result-endpage", and null for every other flag.
 */
ExtractChronology.prototype.capabilityFlagDefault = function ( flag ) {
  var providedFlags = [
    "result-volume",
    "result-issue",
    "result-year",
    "result-page",
    "result-endpage"
  ];

  for (var k = 0; k < providedFlags.length; k++) {
    // Loose comparison, so string-like flag values keep matching.
    if (flag == providedFlags[k]) {
      return true;
    }
  }
  return null;
};

/*
 * Self-test: runs ExtractChronology.extract over a fixed set of sample
 * citations and checks each outcome with ExtractChronology.record_matches.
 * Every sample is always tried; a success is logged at info level and a
 * failure at error level.
 *
 * Returns true if all samples parsed as expected, false otherwise.
 */
ExtractChronology.prototype.unitTest = function () {
    // Each fixture is [ citation text, expected fields as scalars ].
    const fixtures = [
        // No date expected here: neither year rule matches a bare year
        // that is not directly preceded by a word.
        [ "Water Research, Volume 17, Issue 2, 1983, Pages 121-132",
          { volume:17, issue:2, page:121, endpage:132 } ],
        [ "Behavioral Neuroscience, Vol 124(5), Oct 2010, 706-709",
          { volume:124, issue:5, date:2010, page:706, endpage:709 } ],
        [ "Proc Natl Acad Sci U S A. 2010 Aug 31;107(35):15357-61. Epub 2010 Aug 30.",
          { volume:107, issue:35, date:2010, page:15357, endpage:61 } ],
        [ "BIOLOGICAL REVIEWS Volume 85, Issue 1, February 2010, Pages: 55–110",
          { volume:85, issue:1, date:2010, page:55, endpage:110 } ],
        [ "Source: Canadian Journal of Earth Sciences, Volume 47, Number 9, 1 September 2010 , pp. 1269-1275(7)",
          { volume:47, issue:9, date:2010, page:1269, endpage:1275 } ],
        [ "Psychiatric News, Sep 2012; 477 (20); -. doi: 10.1176/appi.pn.2012.11a1",
          { volume:477, issue:20, date:2012 } ],
        [ "Psychiatric News, Sep 2012; 47 (18); 15a-15a",
          { volume:47, issue:18, date:2012, page:"15a", endpage:"15a" } ],
        [ "Acta Obstetrica et Gynaecologica Japonica 62(2), 794",
          { volume:62, issue:2, page:794 } ],
        [ "Learning & Leading with Technology, v38 n1 p20-23 Aug 2010",
          { volume:38, issue:1, date:2010, page:20, endpage:23 } ],
    ];

    var allPassed = true;

    fixtures.forEach(function (fixture) {
        var text = fixture[0];
        var wanted = fixture[1];
        var parsed = {};
        ExtractChronology.extract(text, parsed);

        if (!ExtractChronology.record_matches(parsed, wanted)) {
            logger.error("parsed '" + text + "' to '" + JSON.stringify(parsed) + "' (expected '" + JSON.stringify(wanted) + "')");
            allPassed = false;
            return;
        }
        logger.info("parsed '" + text + "' correctly");
    });

    return allPassed;
};

/*
 * Compares an extracted record against the expected one, logging a
 * warning for every discrepancy and a debug line for every field that
 * agrees.
 *
 * Parameters:
 *   result   - extracted record; each field is read as an array whose
 *              first element is the value to compare.
 *   expected - expected record, with fields given as plain scalars
 *              (possibly integers).
 *
 * Values are compared loosely (!=), so the string "17" matches the
 * integer 17.  Returns true only if every expected field is present with
 * a matching value and the result holds no field that was not expected.
 */
ExtractChronology.record_matches = function(result, expected) {
    var matched = true;
    var field;

    // Start by flagging every field of the result as not yet accounted for.
    var pending = {};
    for (field in result) {
        pending[field] = 1;
    }

    for (field in expected) {
        var actual = result[field];
        if (actual === undefined) {
            logger.warn("key '" + field + "' not defined in result");
            matched = false;
        } else if (actual[0] != expected[field]) {
            logger.warn("key '" + field + "': expected '" + expected[field] + "', got '" + actual[0] + "'");
            matched = false;
        } else {
            logger.debug("key '" + field + "' is good (" + expected[field] + ")");
        }
        // Expected fields are accounted for, whether or not they matched.
        pending[field] = 0;
    }

    // Whatever is still flagged was in the result but never expected.
    for (field in pending) {
        if (pending[field]) {
            logger.warn("unexpected key '" + field + "'");
            matched = false;
        }
    }

    return matched;
};

