scala - Wiki xml parser - org.apache.spark.SparkException: Task not serializable -


I am a newbie to both Scala and Spark, and I am trying some of the tutorials; this one is from Advanced Analytics with Spark. The following code is supposed to work:

import com.cloudera.datascience.common.XmlInputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io._

// Location of the Wikipedia XML dump to read.
val path = "/home/petr/downloads/wiki/wiki"

// Hadoop configuration carrying the start/end tags handed to XmlInputFormat
// (presumably one record per <page>...</page> element -- confirm against XmlInputFormat).
val conf = new Configuration()
conf.set(XmlInputFormat.START_TAG_KEY, "<page>")
conf.set(XmlInputFormat.END_TAG_KEY, "</page>")

// `sc` is not defined in this snippet; it is assumed to be the SparkContext provided by spark-shell.
val kvs = sc.newAPIHadoopFile(path, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text], conf)

// Keep only the second element of each pair, converted to a String.
val rawxmls = kvs.map(p => p._2.toString)

import edu.umd.cloud9.collection.wikipedia.language._
import edu.umd.cloud9.collection.wikipedia._

/** Reads one XML string into a Wikipedia page object.
  *
  * @param xml the XML text handed to `WikipediaPage.readPage`
  * @return `Some((title, content))`, or `None` when the page reports itself as empty
  */
def wikixmltoplaintext(xml: String): Option[(String, String)] = {
  val page = new EnglishWikipediaPage()
  WikipediaPage.readPage(page, xml)
  if (page.isEmpty) None
  else Some((page.getTitle, page.getContent))
}

// This is the statement that fails with "Task not serializable" in the shell output shown below.
val plaintext = rawxmls.flatMap(wikixmltoplaintext)

But it gives:

scala> val plaintext = rawxmls.flatMap(wikixmltoplaintext)
org.apache.spark.SparkException: Task not serializable
    at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:166)
    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:158)
    at org.apache.spark.SparkContext.clean(SparkContext.scala:1622)
    at org.apache.spark.rdd.RDD.flatMap(RDD.scala:295)
    ...

I am running Spark v1.3.0 locally (and have loaded 21 MB of Wiki articles, just to test it).

None of the results at https://stackoverflow.com/search?q=org.apache.spark.sparkexception%3a+task+not+serializable gave me a clue...

thanks.

Try this instead:

    import com.cloudera.datascience.common.XmlInputFormat
    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.io._

    // Location of the Wikipedia XML dump to read.
    val path = "/home/terrapin/downloads/enwiki-20150304-pages-articles1.xml-p000000010p000010000"

    // Hadoop configuration carrying the start/end tags handed to XmlInputFormat.
    val conf = new Configuration()
    conf.set(XmlInputFormat.START_TAG_KEY, "<page>")
    conf.set(XmlInputFormat.END_TAG_KEY, "</page>")

    // `sc` is not defined in this snippet; it is assumed to be the SparkContext provided by spark-shell.
    val kvs = sc.newAPIHadoopFile(path, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text], conf)

    // Keep only the second element of each pair, converted to a String.
    val rawxmls = kvs.map(p => p._2.toString)

    import edu.umd.cloud9.collection.wikipedia.language._
    import edu.umd.cloud9.collection.wikipedia._

    // The parsing logic is written inline as a function literal rather than passed as a
    // separately defined method. NOTE(review): presumably the method reference pulled the
    // enclosing shell object (and non-serializable values such as `conf`) into the closure,
    // which is what Spark refused to serialize -- confirm.
    val plaintext = rawxmls.flatMap { line =>
      val page = new EnglishWikipediaPage()
      WikipediaPage.readPage(page, line)
      // Yield (title, content) for non-empty pages; empty pages contribute nothing.
      if (page.isEmpty) None
      else Some((page.getTitle, page.getContent))
    }