% Conference paper: Goyal, Daume III and Venkatasubramanian, NAACL 2009.
% Author names use the unambiguous "Last, First" form. The accent is written
% as {\'e} at the top level of the field so that BibTeX treats it as a single
% accented letter; "Daum{\'e} III" before the comma is kept together as the
% surname.
@InProceedings{daume09streaming,
  author =       {Goyal, Amit and Daum{\'e} III, Hal and Venkatasubramanian, Suresh},
  title =        {Streaming for Large Scale {NLP}: Language Modeling},
  booktitle =    {North American Chapter of the Association for Computational Linguistics ({NAACL})},
  year =         {2009},
  address =      {Boulder, CO},
  abstract =     {In this paper, we explore a streaming algorithm paradigm to handle
    large amounts of data for NLP problems. We present an efficient
    low-memory method for constructing high-order approximate n-gram
    frequency counts. The method is based on a deterministic streaming
    algorithm which efficiently computes approximate frequency counts
    over a stream of data while employing a small memory footprint. We
    show that this method easily scales to billion-word monolingual
    corpora using a conventional (4 GB RAM) desktop
    machine. Statistical machine translation experimental results
    corroborate that the resulting high-n approximate small language
    model is as effective as models obtained from other count pruning
    methods.},
  url =          {http://pub.hal3.name/#daume09streaming},
}

