// tokentagging_fu.js

/**
 * @class TokenTagging
 * @classdesc concrete.js extensions to the TokenTagging class
 */


/**
 * Factory for a TokenTagging pre-populated with the fields it needs to be
 * valid: a UUID and an AnnotationMetadata (timestamp + tool).
 *
 * Defaults applied before the overrides:
 *   - metadata.timestamp: current time in whole seconds since the epoch
 *   - metadata.tool:      'concrete.js - TokenTagging.create()'
 *   - taggedTokenList:    []
 *   - taggingType:        ''
 *   - uuid:               value returned by concrete.util.generateUUID()
 *
 * Example usage:
 *
 *     tt = TokenTagging.create({taggingType: 'NER'}, {tool: 'HIT'})
 *
 * NOTE(review): the result is produced by extending a fresh `{}` with
 * jQuery's extend, so it is presumably a plain object carrying copies of the
 * TokenTagging fields rather than a `TokenTagging` instance - confirm before
 * relying on `instanceof`.
 *
 * @param {Object} options - Fields merged over the TokenTagging defaults
 * @param {Object} metadataOptions - Fields merged over the resulting
 *                                   tokenTagging.metadata
 * @returns {Object} The merged TokenTagging-shaped object
 */
TokenTagging.create = function(options, metadataOptions) {
  var defaults = new TokenTagging();
  var defaultMetadata = new AnnotationMetadata();

  defaults.metadata = defaultMetadata;
  defaultMetadata.timestamp = Math.floor(Date.now() / 1000);
  defaultMetadata.tool = 'concrete.js - TokenTagging.create()';
  defaults.taggedTokenList = [];
  defaults.taggingType = '';
  defaults.uuid = concrete.util.generateUUID();

  // Overrides are layered on in two passes: top-level fields first, then
  // the metadata fields of whatever metadata object survived the first pass.
  var created = $.extend({}, defaults, options);
  created.metadata = $.extend({}, created.metadata, metadataOptions);
  return created;
};

/**
 * Look up the BIO marker of the TaggedToken at tokenIndex.
 *
 * The marker is the first character of the token's tag. It is only reported
 * when it is exactly 'B', 'I' or 'O'.
 *
 * @param {Number} tokenIndex
 * @returns {String|null} - 'B', 'I' or 'O'; null when there is no TaggedToken
 *                          at tokenIndex, its tag is empty/unset, or the tag
 *                          does not start with one of those characters
 */
TokenTagging.prototype.bioGetBIOValue = function(tokenIndex) {
  var token = this.getTaggedTokenWithTokenIndex(tokenIndex);
  if (!token || !token.tag) {
    return null;
  }

  var marker = token.tag.charAt(0);
  switch (marker) {
    case 'B':
    case 'I':
    case 'O':
      return marker;
    default:
      return null;
  }
};

/**
 * Get tag value (stripped of BIO character and separator) for the
 * TaggedToken at tokenIndex.
 *
 * The number of characters stripped is one (the B/I/O character) plus the
 * length of the current separator as returned by bioGetTagSeparator(), so
 * separators longer or shorter than one character are handled.  With the
 * default '-' separator this strips two characters, e.g. 'B-PER' -> 'PER'.
 *
 * Note: the separator is read through bioGetTagSeparator(), which stores
 * the default separator on this TokenTagging if none has been set yet.
 *
 * @param {Number} tokenIndex
 * @returns {String|null} - Tag text without the BIO prefix (an empty string
 *                          for a tag that has nothing after the prefix, such
 *                          as 'O'), or null if there is no TaggedToken at
 *                          tokenIndex or its tag is empty/unset
 */
TokenTagging.prototype.bioGetTagValue = function(tokenIndex) {
  var taggedToken = this.getTaggedTokenWithTokenIndex(tokenIndex);
  if (taggedToken && taggedToken.tag) {
    // String() mirrors how the separator is stringified when it is
    // concatenated into a tag, so the stripped length always matches.
    var prefixLength = 1 + String(this.bioGetTagSeparator()).length;
    return taggedToken.tag.substring(prefixLength);
  }
  return null;
};

/**
 * Separator placed between the B/I/O character and the rest of the tag in
 * BIO TokenTaggings.
 *
 * The separator is stored on this TokenTagging as `bioTagSeparator`.  When
 * that property is still undefined at the time of the call, it is
 * initialized to '-' and that default is returned.
 *
 * @returns {String} - Separator character for BIO TokenTaggings
 */
TokenTagging.prototype.bioGetTagSeparator = function() {
  // Lazily install the default the first time the separator is requested
  if (typeof this.bioTagSeparator === 'undefined') {
    this.bioTagSeparator = '-';
  }
  return this.bioTagSeparator;
};

/**
 * Returns token index of 'B' tag for the (possibly multi-token) 'BI'
 * tagging at the specified tokenIndex.
 *
 * If the tag at tokenIndex is a 'B' tag, return tokenIndex.  If the
 * tag at tokenIndex is an 'I' tag, walk backwards one token index at a
 * time over the preceding 'I' tags and return the index of the 'B' tag
 * that starts the run.
 *
 * @param {Number} tokenIndex - Index of a token tagged 'B' or 'I'
 * @returns {Number} - Token index of "B" tag
 * @throws {TypeError} Thrown if the tag at tokenIndex is not a 'B' or
 *                     'I' tag.  Also thrown if the tag at tokenIndex
 *                     is a valid 'I' tag, but not part of a valid 'BI*'
 *                     multi-token tagging.
 */
TokenTagging.prototype.bioGetTokenIndexForB = function(tokenIndex) {
  // Each bioGetBIOValue() call is a lookup by token index, so the value for
  // the current position is fetched once and reused.
  var bioValue = this.bioGetBIOValue(tokenIndex);
  if (bioValue !== 'B' && bioValue !== 'I') {
    throw new TypeError("TokenTagging.bioGetTokenIndexForB() expected a 'B' or 'I' tag at tokenIndex " +
                        tokenIndex);
  }

  var bTokenIndex = tokenIndex;
  while (bioValue === 'I') {
    bTokenIndex -= 1;
    bioValue = this.bioGetBIOValue(bTokenIndex);
  }

  if (bioValue !== 'B') {
    // Report the offending tag text itself (not the TaggedToken object, which
    // would stringify as "[object Object]"), or say that nothing is there.
    var offendingToken = this.getTaggedTokenWithTokenIndex(bTokenIndex);
    var found = offendingToken ?
        "tag was '" + offendingToken.tag + "'" :
        'no TaggedToken exists at that index';
    throw new TypeError("TokenTagging.bioGetTokenIndexForB() expected a 'B' tag at tokenIndex " +
                        bTokenIndex + ', but ' + found);
  }
  return bTokenIndex;
};

/**
 * Set BIO TaggedToken tag, keeping any 'I' tags that follow consistent.
 *
 * What is written at tokenIndex depends on bioValue:
 *   - 'B': 'B' + separator + tagText
 *   - 'I': 'I' + separator + the tag value of the token at tokenIndex-1
 *          (the tagText argument is ignored; the previous token must carry
 *          a 'B' or 'I' tag)
 *   - 'O': 'O'
 *
 * If the token at tokenIndex+1 carries an 'I' tag it is then adjusted:
 *   - after writing 'O', it is promoted to a 'B' tag with its own tag text,
 *     because it no longer has a 'B'/'I' predecessor;
 *   - after writing 'B' or 'I', it is rewritten (recursively, so the change
 *     carries through the whole run of 'I' tags) whenever its tag text
 *     differs from the text that was actually written at tokenIndex.
 *
 * @param {String} bioValue - Should be 'B', 'I' or 'O'
 * @param {String} tagText - Tag text for a 'B' tag; ignored for 'I' and 'O'
 * @param {Number} tokenIndex
 * @throws {TypeError} Thrown if bioValue is not 'B'|'I'|'O'.  Also thrown
 *                     if bioValue is 'I', but not part of a valid 'BI*'
 *                     multi-token tagging.
 */
TokenTagging.prototype.bioSetTaggedTokenTag = function(bioValue, tagText, tokenIndex) {
  if (bioValue !== 'B' && bioValue !== 'I' && bioValue !== 'O') {
    throw new TypeError("TokenTagging.bioSetTaggedTokenTag() expected bioValue to be 'B', 'I' or 'O', " +
                        "but instead it was '" + bioValue + "'");
  }

  // Tag text that actually ends up at tokenIndex.  For 'I' tags this is NOT
  // the tagText argument, so it must be tracked separately for the
  // consistency check against the following token.
  var effectiveTagText = tagText;

  if (bioValue === 'B') {
    this.setTaggedTokenTag(bioValue + this.bioGetTagSeparator() + tagText, tokenIndex);
  }
  else if (bioValue === 'I') {
    var bioPreviousValue = this.bioGetBIOValue(tokenIndex-1);
    if (bioPreviousValue !== 'B' && bioPreviousValue !== 'I') {
      throw new TypeError("TokenTagging.bioSetTaggedTokenTag() encountered inconsistent BIO tagging " +
                          "at tokenIndex " + (tokenIndex-1));
    }
    // Get tag value from previous tag, ignore 'tagText' passed into function
    effectiveTagText = this.bioGetTagValue(tokenIndex-1);
    this.setTaggedTokenTag(bioValue + this.bioGetTagSeparator() + effectiveTagText, tokenIndex);
  }
  else {
    this.setTaggedTokenTag('O', tokenIndex);
  }

  if (this.bioGetBIOValue(tokenIndex+1) === 'I') {
    var tagTextNext = this.bioGetTagValue(tokenIndex+1);
    if (bioValue === 'O') {
      // The following 'I' lost its predecessor; it now starts its own tagging
      this.bioSetTaggedTokenTag('B', tagTextNext, tokenIndex+1);
    }
    else if (effectiveTagText !== tagTextNext) {
      // Update tagText for all following 'I' tokens
      this.bioSetTaggedTokenTag('I', effectiveTagText, tokenIndex+1);
    }
  }
};

/**
 * Choose the separator that BIO TokenTaggings place between the B/I/O
 * character and the rest of the tag.
 *
 * The value is stored as-is on this TokenTagging's `bioTagSeparator`
 * property; no validation is performed.
 *
 * @param {String} separator - String used as separator character
 */
TokenTagging.prototype.bioSetTagSeparator = function(separator) {
  var tagging = this;
  tagging.bioTagSeparator = separator;
};

/**
 * Build an independent copy of this TokenTagging's taggedTokenList.
 *
 * Every entry is copied into a newly constructed TaggedToken using
 * jQuery's recursive ("deep") extend, and the copies are returned in the
 * same order as the originals.  The original list is left untouched.
 *
 * @returns {TaggedToken[]}
 */
TokenTagging.prototype.deepCopyTaggedTokenList = function() {
  var originals = this.taggedTokenList;
  var copies = [];
  var position = 0;
  while (position < originals.length) {
    copies.push(jQuery.extend(true, new TaggedToken(), originals[position]));
    position += 1;
  }
  return copies;
};

/**
 * Find the TaggedToken whose tokenIndex is strictly equal to the given one.
 *
 * taggedTokenList is scanned from the front, so if several entries share
 * the same tokenIndex the earliest one in the list is returned.
 *
 * @param {Number} tokenIndex
 * @returns {TaggedToken|null} - The matching TaggedToken, or null if none
 */
TokenTagging.prototype.getTaggedTokenWithTokenIndex = function(tokenIndex) {
  var tokens = this.taggedTokenList;
  var position;
  for (position = 0; position < tokens.length; position += 1) {
    var candidate = tokens[position];
    if (candidate.tokenIndex === tokenIndex) {
      return candidate;
    }
  }
  return null;
};

/**
 * Set taggedTokenList to a list of TaggedTokens (one per token) with identical tags
 *
 * Any existing taggedTokenList contents are discarded.  One TaggedToken is
 * created for each entry of tokenization.tokenList.tokenList, with
 * tokenIndex values 0, 1, 2, ... in list order.
 *
 * @param {Tokenization} tokenization - Used to determine # of TokenTags
 * @param {String} tagText - Value for each TaggedToken's "tag" field
 */
TokenTagging.prototype.setAllTaggedTokenTags = function(tokenization, tagText) {
  // Discard the contents of the existing taggedTokenList
  this.taggedTokenList = [];

  for (var i = 0; i < tokenization.tokenList.tokenList.length; i++) {
    // Declared locally: an undeclared assignment here would create a global
    // variable, and would throw a ReferenceError in strict mode.
    var taggedToken = new TaggedToken();
    taggedToken.tag = tagText;
    taggedToken.tokenIndex = i;
    this.taggedTokenList.push(taggedToken);
  }
};

/**
 * Assign tagText to the TaggedToken that has the given tokenIndex.
 *
 * When no TaggedToken with that tokenIndex exists yet, a new one is
 * created with that tokenIndex, appended to the end of taggedTokenList,
 * and then given the tag.
 *
 * @param {String} tagText
 * @param {Number} tokenIndex
 */
TokenTagging.prototype.setTaggedTokenTag = function(tagText, tokenIndex) {
  var target = this.getTaggedTokenWithTokenIndex(tokenIndex);
  if (target) {
    // Existing entry: just overwrite its tag
    target.tag = tagText;
    return;
  }

  target = new TaggedToken();
  target.tokenIndex = tokenIndex;
  this.taggedTokenList.push(target);
  target.tag = tagText;
};