加入收藏 | 设为首页 | 会员中心 | 我要投稿 核心网 (https://www.hxwgxz.com/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 大数据 > 正文

利用文本挖掘技术来找出网络中的“小鲜词”

发布时间:2021-01-20 10:15:34 所属栏目:大数据 来源:网络整理
导读:开始之前,先看一下从人人网中发现的90后用户爱用的词。是不是很好玩,哈哈。写这篇文章就是为了让你简单地、自动地从文本中找出新词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的很有用,呜呜)。项目结构 当然,text.dat和common.d

抽词程序

package grid.text.evolution;

import grid.common.TextUtils;
import grid.text.dic.CnDictionary;
import grid.text.index.CnPreviewTextIndexer;
import grid.text.index.TextIndexer;
import grid.text.selector.CnTextSelector;
import grid.text.selector.TextSelector;

import java.util.HashSet;
import java.util.Set;


/**
 * Extracts candidate "new words" from a block of Chinese text.
 *
 * <p>Candidates are produced by a {@link CnTextSelector}, filtered against a
 * fixed set of structural characters and the shared {@link CnDictionary},
 * and finally accepted or rejected by an {@link EntropyJudger}.
 */
public class NewWordDiscover {

    /** Minimum word length passed to the candidate selector. */
    private static final int MIN_CANDIDATE_LEN = 2;

    /** Maximum word length passed to the candidate selector. */
    private static final int MAX_CANDIDATE_LEN = 6;

    /**
     * Common function characters (pronouns, particles, interjections). A
     * candidate that starts or ends with one of these is never reported.
     */
    private static final char[] STRUCTURAL_LETTERS = { '我','你','您','他','她','谁','哪','那','这','的','了','着','也','是','有','不','在','与','呢','啊','呀','吧','嗯','哦','哈','呐' };

    /** Lookup set built once from {@link #STRUCTURAL_LETTERS}. */
    private static final Set<Character> STRUCTURAL_LETTER_SET = new HashSet<Character>();

    static {
        for (char c : STRUCTURAL_LETTERS) {
            STRUCTURAL_LETTER_SET.add(c);
        }
    }

    /** Dictionary of already-known words, obtained from {@code CnDictionary.Instance()}. */
    private final CnDictionary dictionary;

    /** Creates a discoverer backed by the shared {@link CnDictionary} instance. */
    public NewWordDiscover() {
        dictionary = CnDictionary.Instance();
    }

    /**
     * Discovers new words in the given document.
     *
     * <p>Discovery is based on statistics and entropy, so the document should
     * be reasonably large (around the 100 KB level); smaller inputs may give
     * unsatisfying results.
     *
     * @param document the text to analyse; {@code null} or empty yields an
     *                 empty result
     * @return a new mutable set holding every candidate accepted by the
     *         entropy judger; never {@code null}
     */
    public Set<String> discover(String document) {

        Set<String> set = new HashSet<String>();
        // Nothing to index or select from: skip building the helpers at all.
        if (document == null || document.isEmpty()) {
            return set;
        }

        TextIndexer indexer = new CnPreviewTextIndexer(document);
        TextSelector selector = new CnTextSelector(document, MIN_CANDIDATE_LEN, MAX_CANDIDATE_LEN);
        EntropyJudger judger = new EntropyJudger(indexer);

        while (!selector.end()) {
            String candidate = selector.next();
            if (TextUtils.isBlank(candidate)) {
                continue;
            }
            if (hasStructuralEdge(candidate)) {
                continue;
            }
            // Replace IF clause with "set.contains(candidate)" if you want to
            // find new word without any dictionary
            if (dictionary.contains(candidate) || set.contains(candidate)) {
                // Already a known word (dictionary or found earlier in this run).
                // NOTE(review): select() presumably tells the selector to accept
                // this candidate and move past it - confirm in TextSelector.
                selector.select();
            } else if (judger.judge(candidate)) {
                set.add(candidate);
            }
        }
        return set;
    }

    /** Returns true when the first or last character of the candidate is a structural letter. */
    private static boolean hasStructuralEdge(String candidate) {
        return STRUCTURAL_LETTER_SET.contains(candidate.charAt(0))
                || STRUCTURAL_LETTER_SET.contains(candidate.charAt(candidate.length() - 1));
    }
}

index

(编辑:核心网)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!

热点阅读