textmill.js

Go to the documentation of this file.
00001 /***
00002 TextMill: A flexible system for text processing.
00003 Copyright 2008 Michael Wojcik, wojcikm4@msu.edu.
00004 ***/
00005 
00006 
/* The TextMill namespace: all shared TextMill state and entry points hang off this object */
var TextMill = {};
00009 
00010 
/***
TMLog: A simple debugging function. By default it does nothing.
To turn logging on, assign a function that accepts one string to
TextMill.TMLogger; under the Javascript Shell, for example, just
execute "TextMill.TMLogger = print". Each message is handed to the
logger prefixed with "TextMill: ".

A TMLogger value that is not a function is ignored, so a stray
assignment (say, "TextMill.TMLogger = true") cannot make every logging
call throw.

This function isn't in the TextMill namespace for convenience.

TODO: Make this write to the console by default.
***/

function TMLog(s)
{
   var logger = TextMill.TMLogger;

   /* Invoke through TextMill so the logger sees the same "this" as before */
   if (typeof logger === "function") logger.call(TextMill, "TextMill: " + s);
}
00024 
00025 
/***
TextMill's toString describes the current state of TextMill: the
registered modules (with their Makers, Takers, Starter and Finisher),
the MultiTexts with the type and creator of each of their transforms,
and the known Sources with their {confidence, strength} standing.

It always returns a primitive string, so TextMill can be converted
implicitly (string concatenation, String(), ...) even when nothing has
been registered yet. Sections whose container is missing or empty are
simply left out.

NOTE(review): every literal below indents with a single space, at every
nesting level. If deeper indentation per level was intended, widen the
literals -- confirm against the desired layout.

TODO: This is probably overly verbose for toString, though it's
convenient during development. Should have a shorter toString and
make this a convenience function (which can always replace the
default toString).
***/

TextMill.toString = function()
{
   /* Only report properties that were really registered, not inherited ones */
   var owns = Object.prototype.hasOwnProperty;

   /* A plain string, not "new String": a wrapper object cannot be returned
      from toString during implicit conversion */
   var desc = "TextMill:\n";

   if (this.modules)
   {
      desc += " Modules:\n";
      for (var module in this.modules)
      {
         if (! owns.call(this.modules, module)) continue;

         var ns = this.modules[module];
         desc += " " + module + ":\n";

         /***
         List Makers known in this module. They're stored as properties of
         the makers property of this module's namespace.
         ***/
         if (ns.makers)
            for (var maker in ns.makers)
               if (owns.call(ns.makers, maker))
                  desc += " " + maker + "\n";

         /* List Takers stored under this module's name */
         if (ns.takers)
            for (var taker in ns.takers)
               if (owns.call(ns.takers, taker))
                  desc += " Take" + taker + "\n";

         /* If this module has a Starter, show that */
         if (typeof ns.Start == "function")
            desc += " Start\n";

         /* And the same for a Finisher */
         if (typeof ns.Finish == "function")
            desc += " Finish\n";
      }
   }

   if (this.multiTexts && this.multiTexts.length > 0)
   {
      desc += " MultiTexts:\n";
      for (var mtidx = 0; mtidx < this.multiTexts.length; mtidx++)
      {
         desc += " " + mtidx + ":\n";
         var mt = this.multiTexts[mtidx];
         for (var transtype in mt.forms)
         {
            if (! owns.call(mt.forms, transtype)) continue;

            /* One line per transform: its type and the module that made it */
            for (var tidx = 0; tidx < mt.forms[transtype].length; tidx++)
            {
               desc +=
                  " "
                + transtype
                + " ("
                + mt.forms[transtype][tidx].creator
                + ")\n";
            }
         }
      }
   }

   if (this.sources)
   {
      desc += " Sources:\n";
      for (var source in this.sources)
      {
         if (! owns.call(this.sources, source)) continue;

         desc +=
            " "
          + this.sources[source].name
          + "{"
          + this.sources[source].standing.confidence
          + ", "
          + this.sources[source].standing.strength
          + "}\n";
      }
   }

   return desc;
};
00113 
00114 
/***
The pattern TextMill uses to recognize maker / taker method names: it
matches a leading "Make" or "Take". It is shared by the code that has
to spot such methods or strip that prefix from their names.
***/

TextMill.mtre = /^[MT]ake/;
00121 
00122 
/***
Sources

A Source is a named thing that can have ethetic standing. This section
defines the container of Sources and the function that builds one.

TextMill.sources stays null until the first Source is created; after
that it is an object holding each Source under its name.
***/

TextMill.sources = null;

/***
Create the Source called "name", register it in TextMill.sources (a
Source already stored under that name is replaced), flag TextMill's
state as changed, and return the new Source.

A Source carries its name, a standing of {confidence, strength, ratings}
that starts at zero, and an updateStanding(conf, str) method.
***/

function Source(name)
{
   /***
   Fold one more rating into this Source's standing. "conf" is the
   confidence of the rating and "str" its strength.

   This is a dumbed-down fuzzy logic. It needs to be replaced with a
   real implementation.

   TODO: Put in a real implementation.
   ***/

   function updateStanding(conf, str)
   {
      var current = this.standing;

      /* Our first rating; take it as given */
      if (current.ratings == 0)
      {
         current.confidence = conf;
         current.strength = str;
         current.ratings = 1.0;
         TMLog
         (
            this.name + " confidence: " + current.confidence
          + ", strength: " + current.strength
         );
         return;
      }

      /***
      Confidence only ever goes up. The candidate is the larger of the
      running average over all ratings and the old confidence moved
      toward 1.0 by a fraction "conf" of the remaining distance.
      ***/
      var averaged =
         (current.confidence * current.ratings + conf) /
         (current.ratings + 1.0);
      var boosted = current.confidence + (1.0 - current.confidence) * conf;

      var candidate;
      if (averaged > boosted)
         candidate = averaged;
      else
         candidate = boosted;

      if (candidate > current.confidence)
      {
         TMLog
         (
            this.name + " confidence: " + current.confidence
          + " to " + candidate
         );
         current.confidence = candidate;
      }

      /***
      Strength rises only when the rating is stronger than the current
      strength; the step is scaled by the rating's confidence.
      ***/
      if (str > current.strength)
      {
         var raised =
            current.strength + (1.0 - current.strength) * (str * conf);
         TMLog
         (
            this.name + " strength: " + current.strength
          + " to " + raised
         );
         current.strength = raised;
      }

      current.ratings += 1.0;
   }

   var source =
   {
      name: name,
      standing: { confidence: 0.0, strength: 0.0, ratings: 0 },
      updateStanding: updateStanding
   };

   TMLog("Created source \"" + name + "\"");

   /* The container is created on demand by the first Source */
   if (! TextMill.sources) TextMill.sources = {};
   TextMill.sources[name] = source;
   TextMill.stateChanged = true;

   return source;
}
00222 
00223 
/***
MultiTexts

TextMill.multiTexts is the array of every MultiText created so far.
MultiText(creatorName) builds a new, empty MultiText, appends it to
that array, flags TextMill's state as changed, and returns it.
***/

TextMill.multiTexts = [];

function MultiText(creatorName)
{
   /* Start from an empty MultiText and fill in its three properties */
   var multiText = {};
   multiText.creator = creatorName;    /* name of creating module */
   multiText.form = {};                /* default transformations of various types */
   multiText.forms = {};               /* arrays of transformations of various types */

   /* Register it with TextMill */
   TextMill.multiTexts.push(multiText);
   TextMill.stateChanged = true;

   return multiText;
}
00247 
00248 
/***
Transforms

A Transform is what a MultiText's "form" object, and the arrays in its
"forms" object, contain. Transform(data, type, creatorName) returns a
new one holding:

   data      the data of the transform
   type      the type of the transform
   creator   the name of the creating module
   takenBy   an initially empty object that will contain flags
             indicating which takers have seen this transformation
             and are done with it
***/

function Transform(data, type, creatorName)
{
   return {
      data: data,
      type: type,
      creator: creatorName,
      takenBy: {}
   };
}
00270 
00271 
/***
Add a module to TextMill. The module should be an object with one
or more methods named "Make<something>" or "Take<something>", and
a property "name" that evaluates to a short string suitable as a
property name. This will serve as the module's namespace.

What gets registered:

 - TextMill.modules[name] becomes the module's namespace. It holds the
   module itself ("module"), its Makers ("makers", keyed by method name),
   its Takers ("takers", keyed by product type), and the module's original
   Make / Take / Start / Finish functions under their own names.
 - Each "Make<type>" method of the module is REPLACED by TextMill's
   shared Maker for that type (created on first use).
 - Each "Take<type>" method is appended to TextMill.takers[type] as
   {taker: <function>, name: <module name>}.
 - A "Start" / "Finish" method adds the module's name to
   TextMill.starters / TextMill.finishers.

Properties that are not functions, and functions with other names, are
ignored. Nothing is returned.
***/

TextMill.addModule = function(module)
{
   /***
   Build the Maker for product type pType. A closure is needed to
   capture the type for each Maker.

   The Maker is called as maker(multiText, data) and must be invoked as
   a method of its module, because the creator name is read from
   this.name. It wraps data in a Transform of type pType, attaches it to
   multiText (creating a new MultiText when none is passed), and returns
   the MultiText.
   ***/

   function createMaker(pType)
   {
      return function(multiText, data)
      {
         /***
         Log the first line of the data as a preview. String() is used so
         that null / undefined data can still be described, and data
         without any newline is shown whole instead of as an empty preview.
         ***/
         var text = String(data);
         var eol = text.indexOf("\n");
         var preview = eol < 0 ? text : text.substring(0, eol);
         TMLog("Making " + pType + " (" + preview + "...)");

         /***
         Are we creating a new MultiText and attaching this transform
         to it as its first form, or are we attaching the transform
         to an existing MultiText?
         ***/
         if (! multiText) multiText = new MultiText(this.name);

         /* Wrap the transform in an identifying object */
         var transform = new Transform(data, pType, this.name);

         /***
         Create an array of transforms of this type if it doesn't
         already exist. Add the transform to the array.
         ***/
         if (! multiText.forms[pType]) multiText.forms[pType] = [];
         multiText.forms[pType].push(transform);

         /***
         The transform also becomes the new default transform of
         this type.
         ***/
         multiText.form[pType] = transform;

         /* Indicate state has changed */
         TextMill.stateChanged = true;

         return multiText;
      };
   }

   /* Create the TextMill module containers if necessary */
   if (! TextMill.modules)  TextMill.modules = {};
   if (! TextMill.makers)   TextMill.makers  = {};
   if (! TextMill.takers)   TextMill.takers  = {};

   /* Create the module's namespace within the TextMill namespace */
   /* TODO: check for namespace collision */
   var name = module.name;
   var ns = { module: module, makers: {}, takers: {} };
   TextMill.modules[name] = ns;

   /* Look for methods to add to the namespace */
   for (var propname in module)
   {
      var prop = module[propname];

      /* Only interested in functions */
      if (typeof prop != "function") continue;

      /* Is it a Maker or a Taker? */
      if (propname.match(TextMill.mtre))
      {
         TMLog("Found " + propname);
         ns[propname] = prop;          /* Record this method */

         /* What are we making or taking? */
         var productType = propname.replace(TextMill.mtre, "");

         if (propname.charAt(0) == "M")     /* is it a Maker? */
         {
            /* Makers are shared: create one only for a type not seen before */
            if (! TextMill.makers[productType])
               TextMill.makers[productType] = createMaker(productType);
            var maker = TextMill.makers[productType];

            /* Record this Maker in the Maker-container for this module ... */
            ns.makers[propname] = maker;

            /* ... and replace the module's stub with our Maker */
            module[propname] = maker;
         }

         else
         {
            /* Record this Taker in the module's Taker-container... */
            ns.takers[productType] = prop;

            /* ... and in TM's array of Takers of this type */
            if (! TextMill.takers[productType])
               TextMill.takers[productType] = [];
            TextMill.takers[productType].push({taker: prop, name: name});
         }
      }

      else if (propname == "Start")
      {
         /* Record this Starter in TM's array of Starters */
         TMLog("Found " + propname);
         ns[propname] = prop;          /* Record this method */
         if (! TextMill.starters) TextMill.starters = [];
         TextMill.starters.push(name);
      }

      else if (propname == "Finish")
      {
         /* Record this Finisher in TM's array of Finishers */
         TMLog("Found " + propname);
         ns[propname] = prop;          /* Record this method */
         if (! TextMill.finishers) TextMill.finishers = [];
         TextMill.finishers.push(name);
      }
   }
};
00414 
00415 
/***
The TextMill run function.

Runs every registered Starter, then lets the Takers process the
MultiTexts until nothing changes any more, then runs every registered
Finisher. Returns true (successful).
***/

TextMill.run = function()
{
   var idx;
   var owns = Object.prototype.hasOwnProperty;

   /* Run starters */
   if (this.starters) for (idx = 0; idx < this.starters.length; idx++)
      this.modules[this.starters[idx]].module.Start();

   /***
   Cycle until no changes are made. On each cycle, iterate:

   - for each MultiText
      - for each transform
         - for each taker for that transform type
            - if that taker has not already seen that transform, or if it has
              but it indicated it wants to see it again, invoke it
               - if the taker returns true ("done with this"), record the
                 fact that this transform has been taken by this taker

   TextMill's stateChanged flag is set whenever one of its state-changing
   functions is invoked (the Makers, MultiText, Source).

   Start another cycle if the stateChanged flag is set.
   ***/

   var multiTexts = this.multiTexts,
       takers = this.takers;

   if (multiTexts && takers) do
   {
      this.stateChanged = false;

      for (var mtidx = 0; mtidx < multiTexts.length; mtidx++)
      {
         var mt = multiTexts[mtidx];
         for (var transtype in mt.forms)
         {
            /* Skip anything inherited; only real transform arrays count */
            if (! owns.call(mt.forms, transtype)) continue;

            for (var tidx = 0; tidx < mt.forms[transtype].length; tidx++)
            {
               var transform = mt.forms[transtype][tidx];
               var tTakers = takers[transform.type];
               if (! tTakers) continue;

               for (var tkidx = 0; tkidx < tTakers.length; tkidx++)
               {
                  var modName = tTakers[tkidx].name;

                  /* This taker already declared itself done with this transform */
                  if (transform.takenBy[modName]) continue;

                  /***
                  The saved reference is to the taker method itself, so
                  it is invoked with the owning module as its object
                  context ("this"). The module is still passed as the
                  first argument as well, so takers written against
                  that signature keep working.

                  TODO: We don't want to save references to the actual
                  methods in the modules at all. We want to save just a
                  reference to the module object itself, and invoke
                  through that.
                  ***/

                  var mod = this.modules[modName].module;
                  var taken = tTakers[tkidx].taker.call
                  (
                     mod               /* object context: the module */
                   , mod               /* the module itself */
                   , transform.data    /* the transform's data */
                   , mt                /* the multitext */
                  );

                  /* A falsy result means "show me this transform again" */
                  transform.takenBy[modName] = taken;
               }
            }
         }
      }
   }
   while (this.stateChanged);

   /* Run finishers */
   if (this.finishers) for (idx = 0; idx < this.finishers.length; idx++)
      this.modules[this.finishers[idx]].module.Finish();

   return true;         /* successful */
};
00494 

Based on content generated using Doxygen