Caricare il vocabolario MeSH all’interno di un database relazionale MySQL con uno script Java


Il programmino qui sotto carica i termini MeSH (ed in particolare il valore del tag “/DescriptorRecordSet > DescriptorRecord > DescriptorName > String“) dal file distribuito dal NML (http://www.nlm.nih.gov/mesh/filelist.html) in un database relazionale MySQL.

Scarica lo script oppure dagli un’occhiata (nel codice ci sono dei commenti per le personalizzazioni):

/**
 * @author Arjuna Del Toso
 * @url http://arjuna.deltoso.net/
 * 
 * @FAQ: what the hell is this?

 * A simple, very simple, program to load the MESH Terms to a MySQL database
 * 
 * @USAGE
 * 1-configure some stuff in the code ("//: CHANGE A VALUE HERE" string is a placeholder for what to change);
 * 2-install the required packages (jdbc connector [http://www.mysql.com/products/connector/j/] and dom4j [http://www.dom4j.org/]);
 * 3-compile it ($javac mesh2database.java);

 * 4-put the MESH Descriptor XML file and the MESH Descriptor DTD file in the current directory;
 * 5-run the program ($java mesh2database) and wait;
 * 6-enjoy the 20.000+ records inserted in your database.
 * 
 */

// Some imports in order to run this program; you need

// the jdbc driver for connecting to MySQL database
// and the dom4j package for XML manipulation
import java.io.File;
import org.dom4j.Document;

import org.dom4j.DocumentException;

import org.dom4j.Element;
import org.dom4j.ElementHandler;
import org.dom4j.ElementPath;

import org.dom4j.io.SAXReader;

import java.sql.*;

// a very fanciful name
public class mesh2database {

	// all work done here
	public static void main(String[] args) throws SQLException {

		// load the jdbc driver

		try {
			Class.forName("com.mysql.jdbc.Driver").newInstance();
		} catch (InstantiationException e1) {
		} catch (IllegalAccessException e1) {
		} catch (ClassNotFoundException e1) {}

		// the connection to mysql db

//: CHANGE A VALUE HERE
//: set your database values
		final Connection con = DriverManager.getConnection("jdbc:mysql://localhost/table", "user", "password");
		// a preparedStatement in the hope to improve speed

//: CHANGE A VALUE HERE
//: configure to match your database schema
		final PreparedStatement s = con.prepareStatement("INSERT IGNORE INTO `table`.`mesh` (`id` , `mesh` ) VALUES (NULL , ?)");

		// the MESH Descriptors XML file,
		// can be found at http://www.nlm.nih.gov/mesh/

//: CHANGE A VALUE HERE
//: insert the path to your mesh file
		File file = new File("desc2007");

		// enable pruning mode to call me back as each node is complete

		SAXReader reader = new SAXReader(false);

		reader.addHandler( "/DescriptorRecordSet/DescriptorRecord/DescriptorName/String",
				new ElementHandler() {
			public void onStart(ElementPath path) {}
			public void onEnd(ElementPath path) {
				// process a single element

				Element meshDescriptorValue = path.getCurrent();
				Element descname = meshDescriptorValue.getParent();
				Element descrecord = descname.getParent();

				// remove .toLowerCase() if you are Case Sensitive ;) 

				String meshHeading = meshDescriptorValue.getText().toLowerCase();

				// i just don't want something like
				// "DNA Degradation, Apoptotic"
				// as single record in my database
				if(!meshHeading.contains(",")){

					// simple debug code (outputs the MESH DESCRIPTORS) 

					System.out.println(meshHeading);

					// the database INSERTs executed here
					try {
						// insert current value in the preparedStatement
						s.setString(1, meshHeading);
						// execute the statement

						s.executeUpdate();
					} catch (SQLException e1) {}
				}
				// prune the tree
				descrecord.detach();
			}
		}
		);

		// just reading the file will do all the work
		try {
			Document document = reader.read(file);
		} catch (DocumentException e) {}

		// closing statement and connection

		s.close();
		con.close();
	}

}

Grossomodo per eseguire il programmino dovrebbero bastarti un compilatore Java, il file XML con il vocabolario MeSH, un database MySQL e le due librerie Java MySQL Connector/J e dom4j.



Leave a Reply