So I have one large file containing a bunch of weather data. I have to allocate each line of the large file to the corresponding state file. There will be a total of 50 new state files, each with its own data.
The large file contains ~1 million lines of records like this:
coop:166657,'new iberia airport acadiana regional la us',200001,177,553
Although the name of the station can vary, so it can have a different number of words.
Currently I am using a regex to find the pattern and output each record to a file, and the records must be grouped by state. If I just read in the entire file without modifications it takes 46 seconds, but the code that finds the state abbreviation, creates the file, and outputs to the file takes over 10 minutes.
This is what I have right now:
package climate; import java.io.bufferedreader; import java.io.file; import java.io.filereader; import java.io.filewriter; import java.io.ioexception; import java.util.arrays; import java.util.scanner; import java.util.regex.matcher; import java.util.regex.pattern; /** * program read in large file containing many stations , states, * , output in order stations corresponding state file. * * note: take long time depending on processor. appends data * files must remove state files in current directory * before running accuracy. * * @author marcus * */ public class climatecleanstates { public static void main(string[] args) throws ioexception { scanner in = new scanner(system.in); system.out .println("note: program can take long time depending on processor."); system.out .println("it not necessary run state files in directory."); system.out .println("but if see how works, may continue."); system.out.println("please remove state files before running."); system.out.println("\nis states directory empty?"); string answer = in.nextline(); if (answer.equals("n")) { system.exit(0); in.close(); } system.out.println("would run program?"); string answer2 = in.nextline(); if (answer2.equals("n")) { system.exit(0); in.close(); } string[] statesspaced = new string[51]; file statefile, dir, infile; // create files each states dir = new file("states"); dir.mkdir(); infile = new file("climatedata.csv"); filereader fr = new filereader(infile); bufferedreader br = new bufferedreader(fr); string line; line = br.readline(); system.out.println(); // read in climatedata.csv final long start = system.currenttimemillis(); while ((line = br.readline()) != null) { // remove instances of -9999 if (!line.contains("-9999")) { string statefilename = null; pattern p = pattern.compile(".* ([a-z][a-z]) us"); matcher m = p.matcher(line); if (m.find()){ statefilename = m.group(1); statefilename = "states/" + statefilename + ".csv"; statefile = new file(statefilename); filewriter statewriter = new 
filewriter(statefile, true); statewriter.write(line + "\n"); // progress reporting //system.out.printf("writing [%s] file [%s]\n", line, // statefile); statewriter.flush(); statewriter.close(); } } } system.out.println("elapsed " + (system.currenttimemillis() - start) + " ms"); br.close(); fr.close(); in.close(); } }
You can use a map to keep track of the open state file writers rather than closing them after each line:
// Keep one open FileWriter per state file instead of reopening per record.
Map<String, FileWriter> fileMap = new HashMap<String, FileWriter>();
try {
    String line;
    while ((line = br.readLine()) != null) {
        // Skip records containing the -9999 missing-data sentinel.
        if (line.contains("-9999")) {
            continue;
        }
        // FIX: the matcher must be created for the current line each
        // iteration; the original snippet called m.find() on a matcher
        // that was never bound to `line`.
        Matcher m = p.matcher(line);
        if (m.find()) {
            String stateFileName = "states/" + m.group(1) + ".csv";
            FileWriter stateFileWriter = fileMap.get(stateFileName);
            if (stateFileWriter == null) {
                stateFileWriter = new FileWriter(stateFileName, true);
                fileMap.put(stateFileName, stateFileWriter);
            }
            stateFileWriter.write(line + "\n");
        }
    }
} finally {
    // Close each writer exactly once after the whole file is parsed;
    // close() flushes, so no separate flush() call is needed.
    for (FileWriter writer : fileMap.values()) {
        writer.close();
    }
}