大家好,欢迎来到IT知识分享网。
中文分词
ChineseParse.cs
using System;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions;
namespace FullTextSearch.Common
{
/// <summary>
/// 中文分词器。
/// </summary>
public class ChineseParse
{
private static readonly ChineseWordsHashCountSet _countTable;
static ChineseParse()
{
_countTable = new ChineseWordsHashCountSet();
InitFromFile("ChineseDictionary.txt");
}
/// <summary>
/// 从指定的文件中初始化中文词语字典和字符串次数字典。
/// </summary>
/// <param name="fileName">文件名</param>
private static void InitFromFile(string fileName)
{
string path = Path.Combine(Directory.GetCurrentDirectory(), @"..\..\Common\", fileName);
if (File.Exists(path))
{
using (StreamReader sr = File.OpenText(path))
{
string s = "";
while ((s = sr.ReadLine()) != null)
{
ChineseWordUnit _tempUnit = InitUnit(s);
_countTable.InsertWord(_tempUnit.Word);
}
}
}
}
/// <summary>
/// 将一个字符串解析为ChineseWordUnit。
/// </summary>
/// <param name="s">字符串</param>
/// <returns>解析得到的ChineseWordUnit</returns>
/// 4
/// 0
private static ChineseWordUnit InitUnit(string s)
{
var reg = new Regex(@"\s+");
string[] temp = reg.Split(s);
//if (temp.Length != 2)
//{
// throw new Exception("字符串解析错误:" + s);
//}
if (temp.Length != 1)
{
throw new Exception("字符串解析错误:" + s);
}
return new ChineseWordUnit(temp[0], Int32.Parse("1"));
}
/// <summary>
/// 分析输入的字符串,将其切割成一个个的词语。
/// </summary>
/// <param name="s">待切割的字符串</param>
/// <returns>所切割得到的中文词语数组</returns>
public static string[] ParseChinese(string s)
{
int _length = s.Length;
string _temp = String.Empty;
var _words = new ArrayList();
for (int i = 0; i < s.Length;)
{
_temp = s.Substring(i, 1);
if (_countTable.GetCount(_temp) > 1)
{
int j = 2;
for (; i + j < s.Length + 1 && _countTable.GetCount(s.Substring(i, j)) > 0; j++)
{
}
_temp = s.Substring(i, j - 1);
i = i + j - 2;
}
i++;
_words.Add(_temp);
}
var _tempStringArray = new string[_words.Count];
_words.CopyTo(_tempStringArray);
return _tempStringArray;
}
}
}
IT知识分享网
ChineseWordsHashCountSet.cs
IT知识分享网using System.Collections;
namespace FullTextSearch.Common
{
/// <summary>
/// 记录字符串出现在中文字典所录中文词语的前端的次数的字典类。如字符串"中"出现在"中国"的前端,则在字典中记录一个次数。
/// </summary>
public class ChineseWordsHashCountSet
{
/// <summary>
/// 记录字符串在中文词语中出现次数的Hashtable。键为特定的字符串,值为该字符串在中文词语中出现的次数。
/// </summary>
private readonly Hashtable _rootTable;
/// <summary>
/// 类型初始化。
/// </summary>
public ChineseWordsHashCountSet()
{
_rootTable = new Hashtable();
}
/// <summary>
/// 查询指定字符串出现在中文字典所录中文词语的前端的次数。
/// </summary>
/// <param name="s">指定字符串</param>
/// <returns>字符串出现在中文字典所录中文词语的前端的次数。若为-1,表示不出现。</returns>
public int GetCount(string s)
{
if (!_rootTable.ContainsKey(s.Length))
{
return -1;
}
var _tempTable = (Hashtable) _rootTable[s.Length];
if (!_tempTable.ContainsKey(s))
{
return -1;
}
return (int) _tempTable[s];
}
/// <summary>
/// 向次数字典中插入一个词语。解析该词语,插入次数字典。
/// </summary>
/// <param name="s">所处理的字符串。</param>
public void InsertWord(string s)
{
for (int i = 0; i < s.Length; i++)
{
string _s = s.Substring(0, i + 1);
InsertSubString(_s);
}
}
/// <summary>
/// 向次数字典中插入一个字符串的次数记录。
/// </summary>
/// <param name="s">所插入的字符串。</param>
private void InsertSubString(string s)
{
if (!_rootTable.ContainsKey(s.Length) && s.Length > 0)
{
var _newHashtable = new Hashtable();
_rootTable.Add(s.Length, _newHashtable);
}
var _tempTable = (Hashtable) _rootTable[s.Length];
if (!_tempTable.ContainsKey(s))
{
_tempTable.Add(s, 1);
}
else
{
_tempTable[s] = (int) _tempTable[s] + 1;
}
}
}
}
ChineseWordUnit.cs
namespace FullTextSearch.Common
{
public struct ChineseWordUnit
{
private readonly int _power;
private readonly string _word;
/// <summary>
/// 结构初始化。
/// </summary>
/// <param name="word">中文词语</param>
/// <param name="power">该词语的权重</param>
public ChineseWordUnit(string word, int power)
{
_word = word;
_power = power;
}
/// <summary>
/// 中文词语单元所对应的中文词。
/// </summary>
public string Word
{
get { return _word; }
}
/// <summary>
/// 该中文词语的权重。
/// </summary>
public int Power
{
get { return _power; }
}
}
}
#ChineseDictionary.txt
主窗体界面
MainManager.cs
IT知识分享网using System;using System.Collections.Generic;using System.Data;using System.Drawing;using System.Windows.Forms;using FullTextSearch.Common;using Npgsql;namespace FullTextSearch{ public partial class MainManager : Form { private readonly PostgreSQL pg = new PostgreSQL(); private readonly SQLquerys sqlQuerys = new SQLquerys(); private char analysisType; private string createConnString = ""; private DataSet dataSet = new DataSet(); private DataTable dataTable = new DataTable(); private char odabirAndOr; private char vrstaPretrazivanja; public MainManager() { InitializeComponent(); rbtn_AND.Checked = true; rbtnNeizmjenjeni.Checked = true; odabirAndOr = '*'; radioButton_Day.Checked = true; radioButton_Day.Checked = true; } private void Form1_Load(object sender, EventArgs e) { gb_unosPodataka.Enabled = false; groupBox_Search.Enabled = false; groupBox_Analysis.Enabled = false; button_Disconnect.Enabled = false; button_Pretrazi.BackColor = Color.WhiteSmoke; button_Disconnect.BackColor = Color.WhiteSmoke; button_unosTekstaUBazu.BackColor = Color.WhiteSmoke; button1.BackColor = Color.WhiteSmoke; } private void button_unosTekstaUBazu_Click(object sender, EventArgs e) { string searchTextBoxString = rTB_unosTextaUBazu.Text; if (searchTextBoxString != "") { pg.insertIntoTable(searchTextBoxString, pg.conn); MessageBox.Show(searchTextBoxString + " 添加到数据库!"); rTB_unosTextaUBazu.Clear(); } else { MessageBox.Show("不允许空数据!"); } } private void button_Pretrazi_Click(object sender, EventArgs e) { string stringToSearch; string sql; string highlitedText; string rank; string check; stringToSearch = txt_Search.Text.Trim(); var list = new List<string>(ChineseParse.ParseChinese(stringToSearch)); ; sql = sqlQuerys.createSqlString(list, odabirAndOr, vrstaPretrazivanja); richTextBox1.Text = sql; check = sqlQuerys.testIfEmpty(stringToSearch); pg.insertIntoAnalysisTable(stringToSearch, pg.conn); pg.openConnection(); var command = new NpgsqlCommand(sql, pg.conn); NpgsqlDataReader reader = command.ExecuteReader(); int count = 0; linkLabel_Rezultat.Text = " "; while (reader.Read()) { highlitedText = reader[1].ToString(); rank = reader[3].ToString(); linkLabel_Rezultat.Text += highlitedText + "[" + rank + "]\n"; count++; } labelBrojac.Text = "找到的文件数量: " + count; pg.closeConnection(); } private void rbtn_AND_CheckedChanged(object sender, EventArgs e) { odabirAndOr = '*'; } private void rbtn_OR_CheckedChanged(object sender, EventArgs e) { odabirAndOr = '+'; } private void rbtnNeizmjenjeni_CheckedChanged(object sender, EventArgs e) { vrstaPretrazivanja = 'A'; } private void rbtn_Rijecnici_CheckedChanged(object sender, EventArgs e) { vrstaPretrazivanja = 'B'; } private void rbtn_Fuzzy_CheckedChanged(object sender, EventArgs e) { vrstaPretrazivanja = 'C'; } private void button_Connect_Click(object sender, EventArgs e) { if (connectMe()) { gb_unosPodataka.Enabled = true; groupBox_Search.Enabled = true; groupBox_Analysis.Enabled = true; textBox_Database.Enabled = false; textBox_IP.Enabled = false; textBox_Port.Enabled = false; textBox_Password.Enabled = false; textBox_UserID.Enabled = false; button_Connect.Enabled = false; button_Disconnect.Enabled = true; button_Pretrazi.BackColor = Color.SkyBlue; button_Disconnect.BackColor = Color.IndianRed; button_unosTekstaUBazu.BackColor = Color.MediumSeaGreen; button1.BackColor = Color.MediumSeaGreen; button_Connect.BackColor = Color.WhiteSmoke; } } private void button_Disconnect_Click(object sender, EventArgs e) { gb_unosPodataka.Enabled = false; groupBox_Search.Enabled = false; groupBox_Analysis.Enabled = false; textBox_Database.Enabled = true; textBox_IP.Enabled = true; textBox_Port.Enabled = true; textBox_Password.Enabled = true; textBox_UserID.Enabled = true; button_Connect.Enabled = true; button_Disconnect.Enabled = false; button_Pretrazi.BackColor = Color.WhiteSmoke; button_Disconnect.BackColor = Color.WhiteSmoke; button_unosTekstaUBazu.BackColor = Color.WhiteSmoke; button1.BackColor = Color.WhiteSmoke; button_Connect.BackColor = Color.MediumSeaGreen; txt_Search.Text = ""; linkLabel_Rezultat.Text = ""; richTextBox1.Text = ""; labelBrojac.Text = ""; } private bool connectMe() { createConnString += "Server=" + textBox_IP.Text + ";Port=" + textBox_Port.Text + ";User Id=" + textBox_UserID.Text + ";Password=" + textBox_Password.Text + ";Database=" + textBox_Database.Text + ";"; sqlQuerys.setTheKey(createConnString); pg.setConnectionString(); pg.setConnection(); if (pg.openConnection()) { MessageBox.Show("您已成功连接!"); pg.closeConnection(); return true; } return false; } private void button1_Click(object sender, EventArgs e) { string selectedTimestamp; selectedTimestamp = dateTimePicker_From.Value.ToString("dd-MM-yyyy hh:mm:ss") + " " + dateTimePicker_To.Value.ToString("dd-MM-yyyy hh:mm:ss"); var analize = new Analysis(selectedTimestamp, analysisType); analize.Show(); } private void radioButton_Day_CheckedChanged(object sender, EventArgs e) { analysisType = 'D'; } private void radioButton_Hour_CheckedChanged(object sender, EventArgs e) { analysisType = 'H'; } }}
SQLquerys.cs代码:
using System.Collections.Generic;namespace FullTextSearch{ internal class SQLquerys { private static string giveMeTheKey; private static int tempInt = 1; //设置连接字符串 public void setTheKey(string connString) { giveMeTheKey = connString; giveMeTheKey += ""; } //将连接字符串存储在静态变量中 public string getTheKey() { giveMeTheKey += ""; return giveMeTheKey; } public void setCounter() { tempInt = 1; } //根据AND和OR的选择分析字符串进行搜索 public string createFunctionString(List<string> searchList, char selector) { string TempString = ""; string[] TempField = null; int i = 0; int j = 0; foreach (string searchStringInList in searchList) { if (j != 0) { if (selector == '+') TempString = TempString + " | "; else if (selector == '*') TempString = TempString + " & "; } j = 1; TempField = splitListForInput(searchStringInList); TempString = TempString + "("; foreach (string justTempString in TempField) { if (i != 0) { TempString = TempString + " & "; } TempString = TempString + justTempString; i = 1; } TempString = TempString + ")"; i = 0; } return TempString; } //帮助方法 public List<string> splitInputField(string[] inputField) { var unfinishedList = new List<string>(); foreach (string splitString in inputField) { unfinishedList.Add(splitString); } return unfinishedList; } //帮助方法 public string[] splitListForInput(string inputString) { string[] parsedList = null; parsedList = inputString.Split(' '); return parsedList; } //在PostgreSQL中创建ts功能的功能,用于字典搜索 public string createTsFunction(string tsString) { string tsHeadline = ""; string tsRank = ""; string tsFunction = ""; tsHeadline = ",\n ts_headline(\"content\", to_tsquery('" + tsString + "')), \"content\""; tsRank = ",\n ts_rank(to_tsvector(\"content\"), to_tsquery('" + tsString + "')) rank"; tsFunction = tsHeadline + tsRank; return tsFunction; } //创建SQL查询依赖于选择哪种类型的搜索,也取决于AND或OR选择器 public string createSqlString(List<string> searchList, char selector, char vrstaPretrazivanja) { string selectString = ""; string myTempString = ""; string TempString = ""; int i = 0; TempString = createFunctionString(searchList, selector); TempString = createTsFunction(TempString); selectString = "SELECT \"id\"" + TempString + "\nFROM \"texttable\" \nWHERE "; if (vrstaPretrazivanja == 'A') { foreach (string myString in searchList) { if (i == 0) { myTempString = myTempString + "\"content\" LIKE '%" + myString + "%' "; i++; } else { if (selector == '*') myTempString = myTempString + "\nAND \"content\" LIKE '%" + myString + "%' "; else if (selector == '+') myTempString = myTempString + "\nOR \"content\" LIKE '%" + myString + "%' "; } } } else if (vrstaPretrazivanja == 'B') { foreach (string myString in searchList) { string temporalString = ""; string[] testingString = myString.Split(' '); for (int k = 0; k < testingString.Length; k++) { if (k != testingString.Length - 1) { temporalString += testingString[k] + " & "; } else { temporalString += testingString[k]; } } if (i == 0) { myTempString = myTempString + "to_tsvector(\"content\") @@ to_tsquery('english', '" + temporalString + "')"; i++; } else { if (selector == '*') myTempString = myTempString + "\nAND to_tsvector(\"content\") @@ to_tsquery('english', '" + temporalString + "')"; else if (selector == '+') myTempString = myTempString + "\nOR to_tsvector(\"content\") @@ to_tsquery('english', '" + temporalString + "')"; } } } if (vrstaPretrazivanja == 'C') { foreach (string myString in searchList) { if (i == 0) { myTempString = myTempString + "\"content\" % '" + myString + "' "; i++; } else { if (selector == '*') myTempString = myTempString + "\nAND \"content\" % '" + myString + "' "; else if (selector == '+') myTempString = myTempString + "\nOR \"content\" % '" + myString + "' "; } } } selectString = selectString + myTempString + "\nORDER BY rank DESC"; return selectString; } public string testIfEmpty(string searchedText) { string checkingIfEmpty = "SELECT * FROM \"analysisTable\" WHERE \"searchedtext\" =' " + searchedText + "'"; return checkingIfEmpty; } public string queryForAnalysis(char analysisChoice) { string myTestsql = ""; if (analysisChoice == 'H') { //这个查询是这样写的只是为了测试的目的,它需要改变 myTestsql = "SELECT * FROM crosstab('SELECT CAST((\"searchedtext\") AS text) searchedText," + " CAST(EXTRACT(HOUR FROM \"timeOfSearch\") AS int) AS sat, CAST(COUNT(*) AS int) AS broj FROM \"analysisTable\" GROUP BY \"searchedText\", sat" + " ORDER BY \"searchedtext\", sat', 'SELECT rbrSata FROM sat ORDER BY rbrSata') AS pivotTable (\"searchedText\" TEXT, t0_1 INT, t1_2 INT" + ", t2_3 INT, t3_4 INT, t4_5 INT, t5_6 INT, t6_7 INT, t7_8 INT, t8_9 INT, t9_10 INT, t10_11 INT, t11_12 INT, t12_13 INT" + ", t13_14 INT, t14_15 INT, t15_16 INT, t16_17 INT, t17_18 INT, t18_19 INT, t19_20 INT, t20_21 INT, t21_22 INT, t22_23 INT, t23_00 INT) ORDER BY \"searchedText\""; return myTestsql; } if (analysisChoice == 'D') { //这个查询是这样写的只是为了测试的目的,它需要改变 myTestsql += "SELECT *FROM crosstab ('SELECT CAST((\"searchedtext\") AS text) AS searchedText, CAST(EXTRACT(DAY FROM \"dateOfSearch\") AS int) AS dan" + ", CAST(COUNT(*) AS int) AS broj FROM \"analysisTable\" GROUP BY \"searchedText\", " + "dan ORDER BY \"searchedtext\", dan', 'SELECT rbrDana FROM dan ORDER BY rbrDana') AS pivotTable(\"searchedtext\" TEXT"; return myTestsql; } return myTestsql; } //此方法用于解析日期 public int[] parseForDates(string date) { string[] temp; var tempInt = new int[3]; temp = date.Split('-'); for (int i = 0; i < 3; i++) { tempInt[i] = int.Parse(temp[i]); } return tempInt; } //此代码用于创建分析,它执行一些日期/时间操作,以便能够为选定的日期/时间创建分析。 public string createSqlForDayAnalysis(string dateFrom, string dateTo) { string insertIntoTempTable = ""; string dateTimeForAnalysis = ""; int[] tempFrom = parseForDates(dateFrom); int[] tempTo = parseForDates(dateTo); //月份变更算法 while (tempFrom[0] != tempTo[0] || tempFrom[1] != tempTo[1]) { if (tempFrom[1] == tempTo[1]) { if (tempFrom[0] != tempTo[0]) { for (int i = tempInt + 1; tempFrom[0] + 2 < tempTo[0] + 2; i++) { insertIntoTempTable += "INSERT INTO \"dan\" VALUES (" + i + ");"; dateTimeForAnalysis += ",dd" + tempFrom[0] + tempFrom[1] + tempFrom[2] + " INT"; tempInt = i; tempFrom[0]++; } } } if (tempFrom[1] != tempTo[1]) { if (tempFrom[1]%2 == 0 || tempFrom[1] == 7 || tempFrom[1] == 1) { for (int i = tempInt; tempFrom[0] < 31 && tempFrom[1] != tempTo[1]; i++) { insertIntoTempTable += "INSERT INTO \"dan\" VALUES (" + i + ");"; dateTimeForAnalysis += ", dd" + tempFrom[0] + tempFrom[1] + tempFrom[2] + " INT"; tempInt = i; tempFrom[0]++; if (tempFrom[0] == 31) { tempFrom[1]++; tempFrom[0] = 1; } } } } } dateTimeForAnalysis += ") ORDER BY \"searchedtext\""; return dateTimeForAnalysis + "#" + insertIntoTempTable; } }}
PostgreSQL.cs代码:
using System;using System.Windows.Forms;using Npgsql;using NpgsqlTypes;namespace FullTextSearch{ public class PostgreSQL { private static int tempInt = 1; private readonly SQLquerys sql = new SQLquerys(); public NpgsqlConnection conn; public string connectionstring; private string newConnString; public PostgreSQL() { setConnectionString(); setConnection(); } public void setConnectionString() { newConnString = sql.getTheKey(); connectionstring = String.Format(newConnString); setConnection(); } public void setConnection() { conn = new NpgsqlConnection(connectionstring); } public bool openConnection() { try { conn.Open(); return true; } catch { MessageBox.Show("Unable to connect! Check parameters!"); return false; } } public void closeConnection() { conn.Close(); } public void insertIntoTable(string textToInsert, NpgsqlConnection nsqlConn) { string mySqlString = "INSERT INTO \"texttable\" (\"content\") VALUES (@Param1)"; var myParameter = new NpgsqlParameter("@Param1", NpgsqlDbType.Text); myParameter.Value = textToInsert; openConnection(); var myCommand = new NpgsqlCommand(mySqlString, nsqlConn); myCommand.Parameters.Add(myParameter); myCommand.ExecuteNonQuery(); closeConnection(); } public void insertIntoAnalysisTable(string textToInsert, NpgsqlConnection nsqlConn) { string dateTime = DateTime.Now.ToString(); string[] temp; temp = dateTime.Split(' '); string mySqlString = "INSERT INTO \"analysistable\" (\"searchedtext\", \"dateofsearch\", \"timeofsearch\") VALUES ('" + textToInsert + "', '" + temp[0] + "'" + ", '" + temp[1] + "');"; openConnection(); var myCommand = new NpgsqlCommand(mySqlString, nsqlConn); myCommand.ExecuteNonQuery(); closeConnection(); } public void executeQuery(string queryText, NpgsqlConnection nsqlConn) { openConnection(); var myCommand = new NpgsqlCommand(queryText, nsqlConn); myCommand.ExecuteNonQuery(); closeConnection(); } public void createTempTable(NpgsqlConnection nsqlConn, char analysisType, string dateFrom, string dateTo, string splitMe) { if (analysisType == 'H') { string dropIfExists = "DROP TABLE IF EXISTS \"sat\";"; string createTempTable = "CREATE TABLE IF NOT EXISTS \"sat\" (rbrSata INT);"; string insertIntoTempTable = ""; for (int i = 0; i < 24; i++) { insertIntoTempTable += "INSERT INTO \"sat\" VALUES (" + i + ");"; } openConnection(); var commandDrop = new NpgsqlCommand(dropIfExists, nsqlConn); commandDrop.ExecuteNonQuery(); var commandCreate = new NpgsqlCommand(createTempTable, nsqlConn); commandCreate.ExecuteNonQuery(); var commandInsert = new NpgsqlCommand(insertIntoTempTable, nsqlConn); commandInsert.ExecuteNonQuery(); closeConnection(); } else if (analysisType == 'D') { string dropIfExists = "DROP TABLE IF EXISTS \"dan\";"; string createTempTable = "CREATE TABLE IF NOT EXISTS \"dan\" (rbrDana INT);"; string insertIntoTempTable = splitMe; openConnection(); var commandDrop = new NpgsqlCommand(dropIfExists, nsqlConn); commandDrop.ExecuteNonQuery(); var commandCreate = new NpgsqlCommand(createTempTable, nsqlConn); commandCreate.ExecuteNonQuery(); var commandInsert = new NpgsqlCommand(insertIntoTempTable, nsqlConn); commandInsert.ExecuteNonQuery(); closeConnection(); } } }}
PostGreSQL sql脚本:
CREATE TABLE public.analysistable( id integer NOT NULL DEFAULT nextval('analysistable_id_seq'::regclass), searchedtext text COLLATE pg_catalog."default" NOT NULL, dateofsearch date NOT NULL, timeofsearch time without time zone NOT NULL, CONSTRAINT analysistable_pkey PRIMARY KEY (id))WITH ( OIDS = FALSE)TABLESPACE pg_default;ALTER TABLE public.analysistable OWNER to king;
CREATE TABLE public.texttable( id integer NOT NULL DEFAULT nextval('texttable_id_seq'::regclass), content text COLLATE pg_catalog."default" NOT NULL, CONSTRAINT texttable_pkey PRIMARY KEY (id))WITH ( OIDS = FALSE)TABLESPACE pg_default;ALTER TABLE public.texttable OWNER to king;
运行结果如图:
免责声明:本站所有文章内容,图片,视频等均是来源于用户投稿和互联网及文摘转载整编而成,不代表本站观点,不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益,请在线联系站长,一经查实,本站将立刻删除。 本文来自网络,若有侵权,请联系删除,如若转载,请注明出处:https://yundeesoft.com/6530.html