加入收藏 | 设为首页 | 会员中心 | 我要投稿 核心网 (https://www.hxwgxz.com/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 大数据 > 正文

利用文本挖掘技术来找出网络中的“小鲜词”

发布时间:2021-01-20 10:15:34 所属栏目:大数据 来源:网络整理
导读:开始之前,先看一下从人人网中发现的90后用户爱用的词。是不是很好玩,哈哈。写这篇文章就是为了让你简单、自动地从文本中找出新词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的很有用,呜呜)。项目结构:当然,text.dat和common.d……

这里写图片描述

分词处理,具体请看下面的实现。

Chunk.java

package grid.text.participle;

import grid.text.dic.CnDictionary;

import java.util.List;


public class Chunk implements Comparable<Chunk> {

    private List<String> list;

    private int len = 0;

    private double avg = 0;

    private double variance = 0;

    public Chunk(List<String> list) {
        this.list = list;
        init();
    }

    private void init() {

        for (String s : list) {
            len += s.length();
        }
        avg = (double) len / list.size();

        for (String s : list) {
            variance += Math.pow(avg - s.length(),2);
        }
        variance = Math.sqrt(variance);
    }

    public int getLen() {
        return len;
    }

    public double getAvg() {
        return avg;
    }

    public double getVariance() {
        return variance;
    }

    public String getHead() {
        if (null == list || list.isEmpty()) {
            return "";
        }
        return list.get(0);
    }

    private int compareDouble(double d1,double d2) {
        if (d1 - d2 < -0.0000001D) {
            return 1;
        } else if (d1 - d2 > 0.0000001D) {
            return -1;
        }
        return 0;
    }

    @Override
    public int compareTo(Chunk o) {

        if (len != o.len) {
            return o.len - len;
        }

        int d = compareDouble(avg,o.avg);
        if (0 != d) {
            return d;
        }

        d = compareDouble(variance,o.variance);
        if (0 != d) {
            return d;
        }

        CnDictionary dictionary = CnDictionary.Instance();

        double rateSrc = 0,rateDest = 0;
        for (String s : list) {
            if (1 == s.length()) {
                rateSrc += dictionary.rate(s.charAt(0));
            }
        }
        for (String s : o.list) {
            if (1 == s.length()) {
                rateDest += dictionary.rate(s.charAt(0));
            }
        }
        return compareDouble(rateSrc,rateDest);
    }

    public String toString() {
        return list.toString();
    }
}

ChunkStream.java

package grid.text.participle;

import grid.common.Node;
import grid.common.TextUtils;
import grid.common.Tree;
import grid.text.dic.CnDictionary;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class ChunkStream {

    /** * Define the max supposed word length * * You could shorten the value if you don't need too long participle result */
    private static final int MAX_WORD_LEN = 7;

    /** * Define the predict level while execute participle. * * Negligible accuracy will be promoted if you increase this value */
    private static final int PREDICT_LEVEL = 3;

    private static CnDictionary dictionary = CnDictionary.Instance();

    public String next(String text,int off) {
        Tree<String> root = new Tree<String>("ROOT");
        recurse(root,off,text,0);
        List<Node<String>> list = root.getLeaves();
        List<Chunk> chunkList = new ArrayList<Chunk>();
        for (Node<String> node : list) {
            chunkList.add(new Chunk(node.getBranchPath()));
        }
        Collections.sort(chunkList);
        return chunkList.get(0).getHead();

    }

    private void recurse(Node<String> node,String text,int predictDeep) {
        int len = MAX_WORD_LEN + off > text.length() ? text.length() - off
                : MAX_WORD_LEN;

        while (predictDeep < PREDICT_LEVEL) {
            if (len < 1) {
                return;
            }

            String s = text.substring(off,off + len);
            if (len < 2) {
                if (!TextUtils.isCnLetter(text.charAt(off))) {
                    break;
                }
                recurse(node.add(s),off + 1,predictDeep + 1);
            } else if (dictionary.contains(s)) {
                recurse(node.add(s),off + s.length(),predictDeep + 1);
            }
            len--;
        }
    }
}

MechanicalParticiple.java

package grid.text.participle;

import grid.common.TextUtils;

import java.util.Vector;


public class MechanicalParticiple {

    public Vector<String> partition(String document) {
        Vector<String> vector = new Vector<String>();
        final int docLen = document.length();
        int off = 0;
        char c;
        String seg = "";
        ChunkStream stream = new ChunkStream();

        while (off < docLen) {
            c = document.charAt(off);
            if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) {
                seg += c;
                off++;
            } else if (TextUtils.isCnLetter(c)) {
                if (!TextUtils.isBlank(seg)) {
                    vector.add(seg);
                    seg = "";
                }
                String word = stream.next(document,off);
                if (!TextUtils.isBlank(word)) {
                    vector.add(word);
                    off += word.length();
                }
            } else {
                if (!TextUtils.isBlank(seg)) {
                    vector.add(seg);
                    seg = "";
                }

                /** * TODO: Uncomment the "ELSE IF" clause if you would like to * reserve punctuations */

                // else if (!TextUtils.isBlank("" + c)) { vector.add("" + c); }

                off++;
            }
        }
        if (!TextUtils.isBlank(seg)) {
            vector.add(seg);
        }
        return vector;

    }
}

selector

(编辑:核心网)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!

热点阅读