阿里面试题:中文乱码处理和大文件计算词频
中文乱码处理示例(讲解见视频)
/**
 * Demonstrates what happens when a UTF-8 byte stream is cut in the middle of a
 * multi-byte character.
 *
 * <p>The poem is encoded to UTF-8 and only its first 11 bytes are kept. Each of
 * the leading Chinese characters occupies 3 bytes in UTF-8, so the slice holds
 * 3 complete characters plus 2 bytes of a truncated 4th one. The decoder emits
 * the complete characters and leaves the incomplete tail unconsumed in the byte
 * buffer. The test prints the decoded text and then the number of bytes that
 * could not be decoded.
 */
@Test
public void test_chinese(){
    var charset = StandardCharsets.UTF_8;
    var decoder = charset.newDecoder();
    var bytes = charset.encode("长坂桥头杀气生,横枪立马眼圆睁。一声好似轰雷震,独退曹家百万兵。")
            .array();
    // Deliberately cut inside a character: 11 is not a multiple of 3.
    var bytes2 = Arrays.copyOfRange(bytes, 0, 11);
    var bbuf = ByteBuffer.allocate(12);
    var cbuf = CharBuffer.allocate(12);
    bbuf.put(bytes2);
    bbuf.flip();
    // The decoder stops at the truncated character; bbuf's position then
    // marks the first byte it could not consume.
    decoder.decode(bbuf, cbuf, true);
    cbuf.flip();
    var tmp = new char[cbuf.length()];
    while(cbuf.hasRemaining()) {
        cbuf.get(tmp);
        System.out.println("here:" + new String(tmp));
    }
    // Undecoded bytes are measured on the byte buffer alone (limit - position).
    // The previous code subtracted the char buffer's position, which is unrelated.
    System.out.format("limit-pos=%d \n", bbuf.limit() - bbuf.position());
}
大文件词频计算(讲解见视频)
package coding.buffer;
import org.junit.Test;
import java.io.*;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.concurrent.*;
/**
 * Counts word frequencies in a large ASCII text file.
 *
 * <p>{@link #run(String, long)} splits the file into chunks, counts each chunk in
 * parallel on the common fork/join pool through a read-only memory mapping, and
 * merges the per-chunk maps. {@link #compare_with_single()} performs the same
 * count sequentially for comparison.
 *
 * <p>Words are whatever {@link StringTokenizer} yields with its default
 * delimiters (space, tab, newline, carriage return, form feed). Bytes are
 * decoded as US-ASCII in both code paths.
 */
public class WordCount {
    final ForkJoinPool pool = ForkJoinPool.commonPool();

    /**
     * Counts the whitespace-separated words of {@code str}.
     *
     * @param str text to tokenize
     * @return a new map from word to number of occurrences in {@code str}
     */
    private static HashMap<String, Integer> countByString(String str){
        var map = new HashMap<String, Integer>();
        StringTokenizer tokenizer = new StringTokenizer(str);
        while(tokenizer.hasMoreTokens()) {
            incKey(tokenizer.nextToken(), map, 1);
        }
        return map;
    }

    /**
     * Adds {@code n} to the counter stored under {@code key}, creating the
     * entry with value {@code n} when the key is absent.
     */
    private static void incKey(String key, HashMap<String, Integer> map, Integer n) {
        map.merge(key, n, Integer::sum);
    }

    /** Adds every counter of {@code part} into {@code total}. */
    private static void mergeInto(HashMap<String, Integer> total, HashMap<String, Integer> part) {
        for(var entry : part.entrySet()) {
            incKey(entry.getKey(), total, entry.getValue());
        }
    }

    /** Returns whether {@code c} is one of StringTokenizer's default delimiters. */
    private static boolean isDelimiter(int c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
    }

    /**
     * Moves a tentative chunk boundary forward to the next delimiter byte so
     * that no word is cut in two.
     *
     * @param file     open file to scan
     * @param boundary tentative boundary offset
     * @param fileSize file length captured by the caller
     * @return offset of the first delimiter at or after {@code boundary}, or
     *         {@code fileSize} when there is none
     */
    private static long alignToDelimiter(RandomAccessFile file, long boundary, long fileSize)
            throws IOException {
        if (boundary >= fileSize) {
            return fileSize;
        }
        var window = new byte[256];
        long pos = boundary;
        file.seek(pos);
        int read;
        while ((read = file.read(window)) > 0) {
            for (int i = 0; i < read; i++) {
                if (isDelimiter(window[i])) {
                    return Math.min(pos + i, fileSize);
                }
            }
            pos += read;
        }
        return fileSize;
    }

    /** Counts the words found in the byte range {@code [start, end)} of a file. */
    class CountTask implements Callable<HashMap<String, Integer>> {
        private final long start;
        private final long end;
        private final String fileName;

        public CountTask(String fileName, long start, long end) {
            this.fileName = fileName;
            this.start = start;
            this.end = end;
        }

        /**
         * Maps the task's byte range read-only, decodes it as US-ASCII and
         * counts its words.
         *
         * @return word counts for this range
         * @throws Exception if the file cannot be opened or mapped
         */
        @Override
        public HashMap<String, Integer> call() throws Exception {
            // Read-only access is enough for a READ_ONLY mapping; "rw" would
            // require write permission and create the file if it were missing.
            // try-with-resources releases the descriptor once the range is decoded.
            try (var file = new RandomAccessFile(this.fileName, "r");
                 var channel = file.getChannel()) {
                var mbuf = channel.map(
                        FileChannel.MapMode.READ_ONLY,
                        this.start,
                        this.end - this.start
                );
                var str = StandardCharsets.US_ASCII.decode(mbuf).toString();
                return countByString(str);
            }
        }
    }

    /**
     * Splits the file into chunks of at least {@code chunkSize} bytes, each
     * ending on a delimiter (or at end of file), and submits one
     * {@link CountTask} per chunk to the pool.
     */
    private ArrayList<Future<HashMap<String, Integer>>> submitChunks(
            String fileName, long fileSize, long chunkSize) {
        var tasks = new ArrayList<Future<HashMap<String,Integer>>>();
        if (fileSize <= 0) {
            // Nothing to read (File.length() is also 0 for a missing file).
            return tasks;
        }
        try (var raf = new RandomAccessFile(fileName, "r")) {
            long position = 0;
            while(position < fileSize) {
                var tentative = Math.min(position + chunkSize, fileSize);
                // Extend the chunk to the next delimiter so a word straddling
                // the tentative boundary is not counted as two fragments.
                var next = alignToDelimiter(raf, tentative, fileSize);
                tasks.add(pool.submit(new CountTask(fileName, position, next)));
                position = next;
            }
        } catch (IOException e) {
            // run() does not declare IOException; keep its signature stable.
            throw new UncheckedIOException(e);
        }
        return tasks;
    }

    /**
     * Counts the words of {@code fileName} in parallel and prints the number of
     * tasks, the elapsed time, the number of distinct words and the count of
     * the word "ababb".
     *
     * @param fileName  path of the file to analyse
     * @param chunkSize minimum chunk size in bytes; must be positive
     * @throws IllegalArgumentException if {@code chunkSize} is not positive
     * @throws UncheckedIOException     if the file cannot be read while chunk
     *                                  boundaries are being computed
     * @throws ExecutionException       if a counting task fails
     * @throws InterruptedException     if interrupted while waiting for a task
     */
    public void run(String fileName, long chunkSize) throws ExecutionException, InterruptedException {
        if (chunkSize <= 0) {
            // A non-positive size would never advance through the file.
            throw new IllegalArgumentException("chunkSize must be positive: " + chunkSize);
        }
        var file = new File(fileName);
        var fileSize = file.length();
        var startTime = System.currentTimeMillis();
        var tasks = submitChunks(fileName, fileSize, chunkSize);
        System.out.format("divided into %d tasks\n", tasks.size());
        var totalMap = new HashMap<String, Integer>();
        for(var task : tasks) {
            mergeInto(totalMap, task.get());
        }
        System.out.println("time:" + (System.currentTimeMillis() - startTime) + "ms");
        System.out.println("total:" + totalMap.size());
        System.out.println(totalMap.get("ababb"));
    }

    /** Runs the parallel count on the file "word" with 1 MiB chunks. */
    @Test
    public void count() throws ExecutionException, InterruptedException {
        var counter = new WordCount();
        counter.run("word", 1024*1024);
    }

    /**
     * Single-threaded baseline: streams the file "word" through a 4 KiB buffer
     * and prints the elapsed time, the count of "ababb" and the number of
     * distinct words.
     *
     * <p>The trailing partial word of each read is carried over to the next
     * one so that words are not split at buffer boundaries.
     */
    @Test
    public void compare_with_single() throws IOException {
        try (var in = new BufferedInputStream(new FileInputStream("word"))) {
            var buf = new byte[4*1024];
            var len = 0;
            var total = new HashMap<String, Integer>();
            var startTime = System.currentTimeMillis();
            // Text after the last delimiter of the previous read; it may be the
            // beginning of a word that continues in the next read.
            var carry = "";
            while((len = in.read(buf)) != -1) {
                // Same charset as the parallel path, independent of the platform default.
                var text = carry + new String(buf, 0, len, StandardCharsets.US_ASCII);
                int cut = text.length() - 1;
                while (cut >= 0 && !isDelimiter(text.charAt(cut))) {
                    cut--;
                }
                if (cut < 0) {
                    // No delimiter yet: the whole text is still one unfinished word.
                    carry = text;
                    continue;
                }
                carry = text.substring(cut + 1);
                mergeInto(total, countByString(text.substring(0, cut)));
            }
            // Whatever is left at end of file is the final word (or empty).
            mergeInto(total, countByString(carry));
            System.out.println("time:" + (System.currentTimeMillis() - startTime) + "ms");
            System.out.println(total.get("ababb"));
            System.out.println(total.size());
        }
    }
}