import java.sql.*; import java.util.*; import java.io.*; import java.text.*; public class AddBook { public static final String url = "[url omitted]"; public static final String username = "[username omitted]"; public static final String passwd = "[password omitted]"; public static final String database = "[database omitted]"; public static final String driver = "org.gjt.mm.mysql.Driver"; private static final int NUM_TALLIES=3; private long start = 0; private int[] tally = new int[NUM_TALLIES + 1]; private Connection con; private Statement stmt; public AddBook() { tally[0] = tally[1] = 0; try { Class.forName(driver); con = DriverManager.getConnection(url, username, passwd); } catch (Exception e) { e.printStackTrace(System.err); } } private int addNewPage(int work, int pageInWork, int globalPosition, int lastPageId) throws SQLException { startTally(); if (lastPageId >= 0) { stmt.executeUpdate("UPDATE UniquePage SET global_end = " + globalPosition + " WHERE id = " + lastPageId); } stmt.executeUpdate("INSERT INTO UniquePage VALUES (0, " + work + ", " + pageInWork + ", " + (globalPosition + 1) + ", -1)"); endTally(2); startTally(); ResultSet rs = stmt.executeQuery("SELECT last_insert_id() FROM UniquePage LIMIT 1"); rs.next(); int pageId = rs.getInt(1); endTally(2); return pageId; } private void insertIntoPCW(int wordId, int pageId) throws SQLException { startTally(); ResultSet rs = stmt.executeQuery ("SELECT * FROM PageContainsWord WHERE " + "word=" + wordId + " AND page=" + pageId); endTally(0); if (rs.next()) return; startTally(); stmt.executeUpdate("INSERT INTO PageContainsWord VALUES (" + wordId + ", " + pageId + ")"); endTally(1); } private void insertIntoWI(int gid, int wordId, int prevWord) throws SQLException { startTally(); stmt.executeUpdate("INSERT INTO WordInstance VALUES(" + wordId + ", " + gid + ", " + prevWord + ")"); endTally(1); } private int getWordId(String root) throws SQLException { int id; startTally(); ResultSet rs = stmt.executeQuery ("SELECT * FROM Word where word=" + formatString(root)); if (!rs.next()) { rs = stmt.executeQuery("SELECT (MAX(id)+1) FROM Word"); rs.next(); id = rs.getInt(1); endTally(0); startTally(); stmt.executeUpdate("INSERT INTO Word VALUES (" + id + ", " + formatString(root) + ")"); endTally(1); System.err.println("Inserted: " + formatString(root) + " (" + id + ")"); } else { id = rs.getInt(1); endTally(0); } return id; } private void insertIntoWD(int globalPosition, String word) throws SQLException { startTally(); stmt.executeUpdate("INSERT INTO WordDetails VALUES (" + globalPosition + ", " + formatString(word) + ")"); endTally(1); } private static final int LINES_PER_PAGE = 70; public void parse(int workId) { int globalPosition = -1; int pageId = -1; int pageInWork = 0; int prevWord = -1; int lineOnPage = LINES_PER_PAGE; BufferedReader in = null; try { stmt = con.createStatement(); ResultSet rs = stmt.executeQuery("SELECT MAX(gpos) FROM WordInstance"); rs.next(); globalPosition = rs.getInt(1); in = new BufferedReader(new FileReader(workId + ".txt")); } catch (Exception e) { e.printStackTrace(System.err); } String line; while (true) { try { try { line = in.readLine(); } catch (NullPointerException e) { break; } if (line == null) { //end of the work, close off the last page stmt.executeUpdate("UPDATE UniquePage SET global_end = " + globalPosition + " WHERE id = " + pageId); break; } if (lineOnPage == LINES_PER_PAGE) { pageInWork++; lineOnPage = 1; pageId = addNewPage(workId, pageInWork, globalPosition, pageId); int selectPct = tally[0] * 100 / tally[NUM_TALLIES]; int updatePct = tally[1] * 100 / tally[NUM_TALLIES]; int pgPct = tally[2] * 100 / tally[NUM_TALLIES]; System.err.println("Starting new page: id:" + pageId + ", p:" + pageInWork + ", gp:" + globalPosition + ", s:" + selectPct +", up:" + updatePct + ", pg:" + pgPct); } StringTokenizer st = new StringTokenizer(line, "\t\n\r -"); while (st.hasMoreTokens()) { String word = st.nextToken(); String root = PorterStemmer.stem(word); if (!st.hasMoreTokens()) word += "-"; //end of line int wordId = getWordId(root); globalPosition++; insertIntoPCW(wordId, pageId); insertIntoWI(globalPosition, wordId, prevWord); insertIntoWD(globalPosition, word); prevWord = wordId; } } catch (Exception e) { e.printStackTrace(System.err); } lineOnPage++; try { con.commit(); } catch (Exception e) {} } } public String formatString(String s) { if (s == null) return "null"; StringBuffer buf = new StringBuffer(s); for (int i = 0; i < buf.length(); i++) { if (buf.charAt(i) == '\'') { buf.insert(i, '\''); i++; } } buf.insert(0, '\''); buf.append('\''); return buf.toString(); } private void startTally() { start = System.currentTimeMillis(); } private void endTally(int idx) { long duration = System.currentTimeMillis() - start; tally[idx] += (int) duration; tally[NUM_TALLIES] += (int) duration; } private static SimpleDateFormat df = new SimpleDateFormat("[yyyy-MM-dd HH:mm:ss] "); public static void main(String[] arg) { try { PrintWriter log = new PrintWriter(new FileWriter("addlog.txt")); int i = Integer.parseInt(arg[0]); log.println(df.format(new java.util.Date()) + "Starting run at " + i); log.flush(); BufferedReader r = new BufferedReader(new FileReader("filenames")); String line; ArrayList a = new ArrayList(); AddBook adder = new AddBook(); while ((line = r.readLine()) != null) { if (line.indexOf('|') > 0) { String id = line.substring(0, line.indexOf('|')); a.add(id); } } Iterator itr = a.iterator(); while (itr.hasNext() && i-- > 1) { itr.next(); } int sequence = 1; while (itr.hasNext()) { int id = Integer.parseInt((String) itr.next()); log.println(df.format(new java.util.Date()) + "Starting parse of work " + id + "(" + sequence + ")"); log.flush(); sequence++; adder.parse(id); System.out.println("Finished work : " + id); Thread.sleep(10000); } log.close(); } catch (Exception e) { e.printStackTrace(System.err); } } }