| 23 | | public abstract Hashtable<String,Integer> getTokenFrequencies(DataColumn target_column); |
|---|
| | 34 | /** |
|---|
| | 35 | * Loads precalculated token frequencies from a relational database |
|---|
| | 36 | * |
|---|
| | 37 | * @return A hashtable containing frequencies, indexed by token |
|---|
| | 38 | */ |
|---|
| | 39 | public Hashtable<String,Integer> getTokenFrequencies(DataColumn target_column, ScaleWeightSetting topbottom, Float limit) { |
|---|
| | 40 | StringBuilder query = new StringBuilder("SELECT token, frequency FROM " + token_table); |
|---|
| | 41 | Integer N = Math.round(limit); |
|---|
| | 42 | switch (topbottom) { |
|---|
| | 43 | case BottomN: |
|---|
| | 44 | query.append(" ORDER BY frequency ASC LIMIT " + N); |
|---|
| | 45 | break; |
|---|
| | 46 | case TopN: |
|---|
| | 47 | query.append(" ORDER BY frequency DESC LIMIT " + N); |
|---|
| | 48 | break; |
|---|
| | 49 | case TopNPercent: |
|---|
| | 50 | // Maybe an exception here? |
|---|
| | 51 | if(N <= 1) { |
|---|
| | 52 | int tokens = getDistinctRecordCount(target_column); |
|---|
| | 53 | int token_limit = Math.round(tokens*limit); |
|---|
| | 54 | query.append(" ORDER BY frequency DESC LIMIT " + token_limit); |
|---|
| | 55 | } |
|---|
| | 56 | else { |
|---|
| | 57 | System.out.println("Error: N should be between 0 and 1"); |
|---|
| | 58 | } |
|---|
| | 59 | break; |
|---|
| | 60 | case BottomNPercent: |
|---|
| | 61 | if(N <= 1) { |
|---|
| | 62 | int tokens = getDistinctRecordCount(target_column); |
|---|
| | 63 | int token_limit = Math.round(tokens*limit); |
|---|
| | 64 | query.append(" ORDER BY frequency ASC LIMIT " + token_limit); |
|---|
| | 65 | } |
|---|
| | 66 | else { |
|---|
| | 67 | System.out.println("Error: N should be between 0 and 1"); |
|---|
| | 68 | } |
|---|
| | 69 | break; |
|---|
| | 70 | case AboveN: |
|---|
| | 71 | query.append(" WHERE frequency > " + N); |
|---|
| | 72 | break; |
|---|
| | 73 | case BelowN: |
|---|
| | 74 | query.append(" WHERE frequency < " + N); |
|---|
| | 75 | break; |
|---|
| | 76 | } |
|---|
| | 77 | |
|---|
| | 78 | ResultSet frequency_rs = sw_connection.getResultSet(query.toString()); |
|---|
| | 79 | Hashtable<String,Integer> frequencies = new Hashtable<String,Integer>(2*N); |
|---|
| | 80 | try { |
|---|
| | 81 | while(frequency_rs != null && frequency_rs.next()) { |
|---|
| | 82 | String token = frequency_rs.getString(1); |
|---|
| | 83 | Integer frequency = frequency_rs.getInt(2); |
|---|
| | 84 | System.out.println(token + " :: " + frequency); |
|---|
| | 85 | frequencies.put(token, frequency); |
|---|
| | 86 | } |
|---|
| | 87 | } catch (SQLException e) { |
|---|
| | 88 | e.printStackTrace(); |
|---|
| | 89 | } |
|---|
| | 90 | return frequencies; |
|---|
| | 91 | } |
|---|
| | 93 | /** |
|---|
| | 94 | * |
|---|
| | 95 | * @param target_column |
|---|
| | 96 | * @param record_limit Determines the number of record that will be stored in memory |
|---|
| | 97 | */ |
|---|
| | 98 | public abstract void analyzeTokenFrequencies(DataColumn target_column, int record_limit); |
|---|
| | 99 | |
|---|
| | 100 | public abstract int getRecordCount(); |
|---|
| | 101 | public abstract int getNonNullCount(DataColumn target_column); |
|---|
| | 102 | public abstract int getNullCount(DataColumn target_column); |
|---|
| | 103 | |
|---|
| | 104 | public int getDistinctRecordCount(DataColumn target_column) { |
|---|
| | 105 | String query = "SELECT COUNT(token) FROM " + token_table + " WHERE datasource_id = " + datasource_id + " AND field_id = " + target_column.getColumnID(); |
|---|
| | 106 | return sw_connection.executeQuery(query); |
|---|
| | 107 | } |
|---|
| | 108 | |
|---|
| | 109 | public boolean deleteAnalysis(DataColumn target_column) { |
|---|
| | 110 | String query = "DELETE FROM " + token_table + " WHERE datasource_id = " + datasource_id; |
|---|
| | 111 | return sw_connection.executeUpdate(query); |
|---|
| | 112 | } |
|---|
| | 113 | |
|---|
| | 114 | public void addOrUpdateToken(DataColumn target_column, String datasource_id, String token, Integer frequency) { |
|---|
| | 115 | int db_frequency = sw_connection.getTokenFrequency(target_column,datasource_id, token); |
|---|
| | 116 | // Database and memory are at the same state, we don't need to do anything |
|---|
| | 117 | if(db_frequency != frequency) { |
|---|
| | 118 | // New record, not in the database |
|---|
| | 119 | if(frequency == 1 || db_frequency == 0) { |
|---|
| | 120 | sw_connection.insertToken(target_column, datasource_id, token, frequency); |
|---|
| | 121 | } |
|---|
| | 122 | else { |
|---|
| | 123 | sw_connection.updateTokenFrequency(target_column, datasource_id, token, frequency); |
|---|
| | 124 | } |
|---|
| | 125 | } |
|---|
| | 126 | } |
|---|
| | 127 | |
|---|
| | 128 | public void setNonNullCount(DataColumn target_column) { |
|---|
| | 129 | target_column.setNonNullCont(getNonNullCount(target_column)); |
|---|
| | 130 | } |
|---|
| | 131 | |
|---|
| | 132 | public void setNullCount(DataColumn target_column) { |
|---|
| | 133 | target_column.setNullCount(getNullCount(target_column)); |
|---|
| | 134 | } |
|---|
| | 135 | |
|---|
| | 136 | public void setRecordCount() { |
|---|
| | 137 | reader.data_source.setRecordCount(getRecordCount()); |
|---|
| | 138 | } |
|---|