Skip to content

阿里面试题:中文乱码处理和大文件计算词频

中文乱码处理示例(讲解见视频)

    /**
     * Demonstrates what happens when a UTF-8 byte stream is cut in the middle of a
     * multi-byte character.
     *
     * <p>Only the first 11 bytes of the encoded text are handed to the decoder. Each of
     * the leading Chinese characters takes 3 bytes in UTF-8, so 11 bytes hold three whole
     * characters plus 2 bytes of a fourth. The decoder emits the whole characters and
     * leaves the incomplete tail in the byte buffer; the number of such leftover bytes
     * is printed at the end.
     */
    @Test
    public void test_chinese(){
        var charset = StandardCharsets.UTF_8;
        var decoder = charset.newDecoder();
        // getBytes returns exactly the encoded bytes; ByteBuffer.array() may expose a
        // larger backing array with unused trailing capacity.
        var bytes = "长坂桥头杀气生,横枪立马眼圆睁。一声好似轰雷震,独退曹家百万兵。"
                .getBytes(charset);
        // Deliberately truncate in the middle of the fourth character.
        var bytes2 = Arrays.copyOfRange(bytes, 0, 11);
        var bbuf = ByteBuffer.allocate(12);
        var cbuf = CharBuffer.allocate(12);
        bbuf.put(bytes2);
        bbuf.flip();
        // The CoderResult is intentionally ignored: the point of the demo is to look at
        // what was decoded and how many input bytes were left unconsumed.
        decoder.decode(bbuf, cbuf, true);
        cbuf.flip();

        if (cbuf.hasRemaining()) {
            var tmp = new char[cbuf.length()];
            cbuf.get(tmp);
            System.out.println("here:" + new String(tmp));
        }

        // Undecoded bytes still sitting in the input buffer. Both operands must come
        // from bbuf; mixing in cbuf.position() compares bytes against chars.
        System.out.format("limit-pos=%d \n", bbuf.limit() - bbuf.position());
    }

大文件词频计算(讲解见视频)

package coding.buffer;

import org.junit.Test;

import java.io.*;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.concurrent.*;

/**
 * Counts word frequencies in a large ASCII text file by splitting it into byte ranges
 * that are processed in parallel, and offers a single-threaded baseline for comparison.
 *
 * <p>Words are separated by the default {@link StringTokenizer} delimiters
 * (space, tab, line feed, carriage return, form feed).
 */
public class WordCount {

    /** Size of the scratch buffer used when searching for a word boundary. */
    private static final int PROBE_SIZE = 256;

    final ForkJoinPool pool = ForkJoinPool.commonPool();

    /**
     * Counts the occurrences of every whitespace-separated token in {@code str}.
     *
     * @param str text to tokenize
     * @return a new map from token to its number of occurrences in {@code str}
     */
    private static HashMap<String, Integer> countByString(String str){
        var map = new HashMap<String, Integer>();
        var tokenizer = new StringTokenizer(str);
        while (tokenizer.hasMoreTokens()) {
            incKey(tokenizer.nextToken(), map, 1);
        }
        return map;
    }

    /**
     * Adds {@code n} to the counter stored under {@code key}, creating it if absent.
     *
     * @param key word whose counter is updated
     * @param map counters to update in place
     * @param n   amount to add
     */
    private static void incKey(String key, HashMap<String, Integer> map, Integer n) {
        map.merge(key, n, Integer::sum);
    }

    /** Adds every counter of {@code partial} into {@code total}. */
    private static void mergeInto(HashMap<String, Integer> total, HashMap<String, Integer> partial) {
        for (var entry : partial.entrySet()) {
            incKey(entry.getKey(), total, entry.getValue());
        }
    }

    /** Returns whether {@code c} is one of the default {@link StringTokenizer} delimiters. */
    private static boolean isDelimiter(int c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
    }

    /**
     * Moves a raw chunk offset to a position where no word is cut in half.
     *
     * <p>The result is the offset of the first delimiter at or after {@code pos - 1},
     * or {@code size} if there is none. Applying this to both edges of every chunk
     * assigns each word to the chunk that contains its first byte, so adjacent chunks
     * neither split a word nor count it twice.
     *
     * @param file file to inspect; its file pointer is moved
     * @param pos  raw chunk offset
     * @param size length of the file in bytes
     * @return the aligned offset, in {@code [0, size]}
     */
    private static long alignToWordBoundary(RandomAccessFile file, long pos, long size) throws IOException {
        if (pos <= 0 || size <= 0) {
            return 0;
        }
        var probe = new byte[PROBE_SIZE];
        // Start one byte early: if the byte before the cut is a delimiter, the cut
        // already sits on a word boundary.
        long cursor = Math.min(pos, size) - 1;
        file.seek(cursor);
        while (cursor < size) {
            int read = file.read(probe);
            if (read <= 0) {
                break;
            }
            for (int i = 0; i < read && cursor + i < size; i++) {
                if (isDelimiter(probe[i])) {
                    return cursor + i;
                }
            }
            cursor += read;
        }
        return size;
    }

    /** Counts the words that begin inside one byte range of the file. */
    class CountTask implements Callable<HashMap<String, Integer>> {

        private final long start;
        private final long end;
        private final String fileName;

        /**
         * @param fileName path of the file to read
         * @param start    raw start offset of the chunk (inclusive)
         * @param end      raw end offset of the chunk (exclusive)
         */
        public CountTask(String fileName, long start, long end) {
            this.fileName = fileName;
            this.start = start;
            this.end = end;
        }

        /**
         * Maps the word-aligned range of this chunk into memory, decodes it as
         * US-ASCII and counts its words.
         *
         * @return word counts for this chunk; empty if the chunk holds no word start
         */
        @Override
        public HashMap<String, Integer> call() throws Exception {
            // Read-only access is enough, and both handles are closed when the task ends.
            try (var file = new RandomAccessFile(this.fileName, "r");
                 var channel = file.getChannel()) {
                var size = channel.size();
                var from = alignToWordBoundary(file, this.start, size);
                var to = alignToWordBoundary(file, this.end, size);
                if (from >= to) {
                    // A single word spans the whole chunk; its owner chunk counts it.
                    return new HashMap<>();
                }
                var mbuf = channel.map(FileChannel.MapMode.READ_ONLY, from, to - from);
                var str = StandardCharsets.US_ASCII.decode(mbuf).toString();
                return countByString(str);
            }
        }
    }

    /**
     * Counts the words of {@code fileName} in parallel and prints timing, the number of
     * distinct words, and the count of the sample word {@code "ababb"}.
     *
     * @param fileName  path of the file to analyse
     * @param chunkSize nominal number of bytes handled by one task; must be positive
     * @throws IllegalArgumentException if {@code chunkSize} is not positive
     * @throws ExecutionException       if a counting task fails
     * @throws InterruptedException     if interrupted while waiting for a task
     */
    public void run(String fileName, long chunkSize) throws ExecutionException, InterruptedException {
        if (chunkSize <= 0) {
            // A non-positive chunk size would never advance the position below.
            throw new IllegalArgumentException("chunkSize must be positive: " + chunkSize);
        }
        var file = new File(fileName);
        var fileSize = file.length();
        long position = 0;
        var tasks = new ArrayList<Future<HashMap<String,Integer>>>();
        var startTime = System.currentTimeMillis();

        while (position < fileSize) {
            var next = Math.min(position + chunkSize, fileSize);
            tasks.add(pool.submit(new CountTask(fileName, position, next)));
            position = next;
        }
        System.out.format("divided into %d tasks\n", tasks.size());

        var totalMap = new HashMap<String, Integer>();
        for (var task : tasks) {
            mergeInto(totalMap, task.get());
        }
        System.out.println("time:" + (System.currentTimeMillis() - startTime) + "ms");
        System.out.println("total:" + totalMap.size());
        System.out.println(totalMap.get("ababb"));
    }

    /** Runs the parallel count on the file {@code "word"} with 1 MiB chunks. */
    @Test
    public void count() throws ExecutionException, InterruptedException {
        var counter = new WordCount();
        counter.run("word", 1024*1024);
    }

    /**
     * Single-threaded baseline: streams the file {@code "word"} through a 4 KiB buffer
     * and prints timing, the count of {@code "ababb"} and the number of distinct words.
     */
    @Test
    public void compare_with_single() throws IOException {
        var total = new HashMap<String, Integer>();
        var startTime = System.currentTimeMillis();
        try (var in = new BufferedInputStream(new FileInputStream("word"))) {
            var buf = new byte[4*1024];
            // Trailing characters of the previous read that may be the first half of a
            // word continued in the next read.
            var carry = "";
            int len;
            while ((len = in.read(buf)) != -1) {
                // Same charset as the parallel version so both produce the same keys.
                var str = carry + new String(buf, 0, len, StandardCharsets.US_ASCII);
                var cut = str.length();
                while (cut > 0 && !isDelimiter(str.charAt(cut - 1))) {
                    cut--;
                }
                // Everything before the last delimiter consists of complete words.
                carry = str.substring(cut);
                mergeInto(total, countByString(str.substring(0, cut)));
            }
            mergeInto(total, countByString(carry));
        }

        System.out.println("time:" + (System.currentTimeMillis() - startTime) + "ms");
        System.out.println(total.get("ababb"));
        System.out.println(total.size());
    }
}

文章来源于自己总结和网络转载,内容如有任何问题,请大佬斧正!联系我