WORKED EXAMPLE using GMOD Chado database in Argos system for Daphnia genome project Don Gilbert, 15 Feb 2004 software: gmod release 0.001 with patches additional perl modules written for sequence load,dump configured to work in Argos genome system, using its Postgres DB, Perl modules ========================================================== ========================================================== STEP 0 .... Initialize Argos/GMOD environment ========================================================== #n need Argos ENVIRON settings to use databases from shell % bin/argos-env -dump -key PERL,ARGOS,PG,DAPHNIA #n these ones are essential setenv ARGOS_ROOT "/bio/biodb" setenv PERL5LIB "$ARGOS_ROOT/common/perl/lib:$ARGOS_ROOT/common/system-local/perl/lib" setenv PGPORT "7302" setenv PGDATA "/bio/biodb/ROOT/indices/pgsql" setenv DAPHNIA_PGDATA "/bio/biodb/daphnia/indices/pgsql" # set them in shell -- tcsh/csh % source `bin/argos-env -key PERL,ARGOS,PG,DAPHNIA` # bash/sh . `bin/argos-env -key PERL,ARGOS,PG,DAPHNIA` #n check Postgres server, start if needed % $ARGOS_ROOT/ROOT/bin/run-postgres2 #n Set path to include postgresql/bin (tcsh) % set path=(/bio/biodb/common/servers/postgresql/bin $path) #n Set shared lib path to use (tcsh) % setenv LD_LIBRARY_PATH ${LD_LIBRARY_PATH}:$ARGOS_ROOT/common/system-local/lib:$ARGOS_ROOT/common/servers/postgresql/lib #n check postgres is responding % psql -l List of databases Name | Owner | Encoding -----------+----------+----------- chado001 | gilbertd | SQL_ASCII chadoxA | gilbertd | SQL_ASCII template0 | gilbertd | SQL_ASCII template1 | gilbertd | SQL_ASCII testdb | gilbertd | SQL_ASCII #n Check Argos location of GMOD tools % ls $ARGOS_ROOT/gmod/ bin/ common@ data/ indices/ lib/ tmp/ webapps/ cgi-bin/ conf/ doc/ install/ modules/ web/ bin/ -- has perl scripts for this example conf/ -- has gmod.conf with CHADO_DB parameters for this system lib/ -- has perl modules (also/or in $ARGOS_ROOT/common/perl/lib) data/ontologies/ -- ontology 
data install/initialize.sql -- database init. NOTE: gmod.conf should be configured to match Postgres DB settings (ROOT/bin/run-postgres2 lists these) #n Check for ontologies % du $ARGOS_ROOT/gmod/data/ontologies 10345 gmod/data/ontologies/go -- can load another time 297 gmod/data/ontologies/song -- need 6 gmod/data/ontologies/obo_rel -- need #n Check or create Argos location for Daphnia database % ls $ARGOS_ROOT/daphnia/ bin/ daphnia_db.check docs/ shared@ webapps/ cgi-bin/ data/ indices/ temp@ work/ common@ datagen/ lib/ tmp/ conf/ dbs/ logs@ web/ bin/ -- has symlinks to gmod/bin/ conf/ -- has gmod.conf for daphnia db settings (overrides gmod/conf/) data/ -- has sequences indices/pgsql/ -- postgres db indices for this database will be located here (configured in daphnia/conf/gmod.conf) NOTE: daphnia/conf/gmod.conf should be configured with project-specific settings ========================================================== STEP 1 .... Make daphnia Chado DB ========================================================== #n tools for loading and dumping miscellaneous sequences. Each of these responds to the -help option. See GMOD::Chado::SeqUtils.pm for common routines. gmod_init_db.pl -- initialize a new database, adding organisms, initialize.sql, ontology data sets. gmod_load_newseq.pl -- add miscellaneous organism sequences, cDNA, EST, microsatellites, etc. not located on genome. Optionally generate PublicID for these. gmod_dump_seq.pl -- output sequences selected by organism, publication (input file), seq type. gmod_list_db.pl -- show feature statistics for chado db: # per organism, per seq type, per publication/infile, and checksum test for sequence duplications. % cd $ARGOS_ROOT/daphnia/ -- or -- cd $ARGOS_ROOT/$ARGOS_SERVICE/ % bin/gmod_init_db.pl % bin/gmod_init_db.pl -dbname daphnia \ -org='waterflea,Daphnia pulex' \ -org='waterflea,Daphnia magna' \ -org='waterflea,Daphnia pulicaria' \ -ontology=obo_rel,song A database called 'daphnia' already exists. 
OK to drop database 'daphnia'? [yes/NO] yes Dropping database 'daphnia' DROP DATABASE Creating new database called 'daphnia' CREATE DATABASE Creating tables psql:/bio/biodb/gmod/modules/complete.sql:1424: ERROR: Type "gffatts" does not exist psql:/bio/biodb/gmod/modules/complete.sql:1510: ERROR: language "plpgsql" does not exist Database 'daphnia' created Loading initial sql: /bio/biodb/gmod/install/initialize.sql psql:/bio/biodb/gmod/install/initialize.sql:52: ERROR: Relation "array" does not exist psql:/bio/biodb/gmod/install/initialize.sql:53: ERROR: Relation "array" does not exist psql:/bio/biodb/gmod/install/initialize.sql:54: ERROR: Relation "array" does not exist psql:/bio/biodb/gmod/install/initialize.sql:55: ERROR: Relation "array" does not exist insert into organism (abbreviation, genus, species, common_name) values('D.pulex','Daphnia','pulex','waterflea'); INSERT 288509 1 insert into organism (abbreviation, genus, species, common_name) values('D.magna','Daphnia','magna','waterflea'); INSERT 288510 1 insert into organism (abbreviation, genus, species, common_name) values('D.pulicaria','Daphnia','pulicaria','waterflea'); INSERT 288511 1 Loading ontology: /bio/biodb/gmod/data/ontologies/obo_rel/rel.ontology Chado::LoadDBI(Main,dbi:Pg:dbname=daphnia;port=7302;host=localhost,,passwd) Loading ontology: /bio/biodb/gmod/data/ontologies/song/so.ontology Chado::LoadDBI(Main,dbi:Pg:dbname=daphnia;port=7302;host=localhost,,passwd) Database 'daphnia' initialized. ========================================================== STEP 2 .... 
Load each data sequence file ========================================================== #n sequences to be loaded in daphnia/data/ 1214424 Feb 12 21:30 CGBvntr.fa 191671 Jan 30 10:08 cDNA1.fa 373273 Jan 30 10:08 cDNA2.fa 236326 Feb 12 21:29 est1.fa 404562 Feb 12 21:29 est2.fa 660425 Jan 30 10:09 microDNA.fa 1143886 Jan 30 10:09 microsats.fa #n load each file to database #n use parameters for -organism, seq -type, -idmake prefix (generate PublicIDS) % bin/gmod_load_newseq.pl -org="D.pulex" -in=data/CGBvntr.fa -type=cDNA --idmake="WFcl" Loading sequences to database. Working with Daphnia pulex. These terms from Sequence Ontology match 'cDNA'. cDNA_match cDNA_clone chimeric_cDNA_clone genomically_contaminated_cDNA_clone genomic_polyA_primed_cDNA_clone partially_unprocessed_cDNA_clone CDS_supported_by_EST_or_cDNA_data Choose Sequence type? cDNA_clone Working with Daphnia pulicaria. .................................................. 50 WFcl0000050 .................................................. 100 WFcl0000100 --- .................................................. 1450 WFcl0001450 ............................................. 1495 sequences added 0 duplicate sequences skipped Done % bin/gmod_load_newseq.pl -org="D.pulex" -in=data/cdna/cDNA1.fa -type=cDNA_clone -idmake="WFcl" Loading sequences to database. Working with Daphnia pulex. .................................................. 50 WFcl0001545 --- .................................................. 350 WFcl0001845 ............................................... 397 sequences added 0 duplicate sequences skipped Done % bin/gmod_load_newseq.pl -org="D.pulex" -in=data/cdna/cDNA2.fa -type=cDNA_clone -idmake="WFcl" Loading sequences to database. Working with Daphnia pulex. .................................................. 50 WFcl0001942 --- .................................................. 600 WFcl0002492 ................... 
619 sequences added 0 duplicate sequences skipped Done % bin/gmod_load_newseq.pl -org="D.pulex" -in=data/est1.fa -type=EST -idmake="WFes" Loading sequences to database. Working with Daphnia pulex. .................................----.--------.--. 36 WFes0000036 .-.---.-.--.-...................-................. 77 WFes0000077 ...-........-...-................................. 124 WFes0000124 ..........-.-..-....-......-.-........--.......... 166 WFes0000166 ............-.........................-......-...- 212 WFes0000213 -...-...................-............-----.......- 253 WFes0000254 ................-.......-..-...-.................- 298 WFes0000299 ...........................-........-.......... 343 sequences added 54 duplicate sequences skipped new=WFBid=1529|clone=P3-G4|taxon=D.pulex|strain=WindsorPond,Ontario|library=Guelphest|date=Jan2004|note1=|contact=Tcrease| old=>WFcl0001529 len=718;type=cDNA_clone;synonym=P3-G4 new=WFBid=1530|clone=P3-E11|taxon=D.pulex|strain=WindsorPond,Ontario|library=Guelphest|date=Jan2004|note1=|contact=Tcrease| old=>WFcl0001530 len=700;type=cDNA_clone;synonym=P3-E11 new=WFBid=1531|clone=P1-H1|taxon=D.pulex|strain=WindsorPond,Ontario|library=Guelphest|date=Jan2004|note1=|contact=Tcrease| old=>WFcl0001531 len=699;type=cDNA_clone;synonym=P1-H1 --- Done #n NOTE: the loader checks for duplicates when generating IDs by checksum, seq length and synonyms/names. #n It will retain duplicates if synonyms don't match, but skip where these three do match. % bin/gmod_load_newseq.pl -org="D.pulex" -in=data/est2.fa -type=EST -idmake="WFes" Loading sequences to database. Working with Daphnia pulex. The organism 'D.arenata' could not be found. .The organism 'D.arenata' could not be found. #n use gmod_init_db to add this new organism % bin/gmod_init_db.pl -init=0 -org='waterflea,Daphnia arenata' A database called 'daphnia' already exists. OK to drop database 'daphnia'? [yes/NO] Will not drop database 'daphnia'. Skipping on to other data. 
insert into organism (abbreviation, genus, species, common_name) values('D.arenata','Daphnia','arenata','waterflea'); INSERT 318698 1 Database 'daphnia' initialized. #n and start again % bin/gmod_load_newseq.pl -org="D.pulex" -in=data/est2.fa -type=EST -idmake="WFes" Loading sequences to database. Working with Daphnia pulex. It appears that you have already loaded this exact file Do you want to continue [no]? yes Working with Daphnia arenata. .................................................. 50 WFes0000393 --- .................................................. 600 WFes0000943 ................... 619 sequences added 0 duplicate sequences skipped Done #n -- skip data/microsat/microDNA.fa? #n -- is data/microsat/microsats.fa a superset w/ diff names? % bin/gmod_load_newseq.pl -org="D.pulex" -in=data/microsat/microDNA.fa -type=microsatellite -idmake="WFms" Loading sequences to database. Working with Daphnia pulex. .................................................. 50 WFms0000050 --- .................................................. 850 WFms0000850 ........ 858 sequences added 0 duplicate sequences skipped Done % bin/gmod_load_newseq.pl -org="D.pulex" -in=data/microsat/microsats.fa -type=microsatellite -idmake="WFms" Loading sequences to database. Working with Daphnia pulex. --...-----...----...-.--..--..----..--..---.--.--- 20 WFms0000879 ...-...-..-.---.-.--..-.-.-.-..-.----....--.--..-- 46 WFms0000905 --- -----..-.-.-...-.--..--....--...-..-.-.-.....-.... 593 WFms0001451 ...--..-.---..----.---.--------------------------. 604 WFms0001462 -.----. 606 sequences added 851 duplicate sequences skipped new=P1-11T7 old=>WFms0000001 len=635;type=microsatellite;synonym=p1-11t7 new=P1-14T7 old=>WFms0000002 len=725;type=microsatellite;synonym=p1-14t7 --- new=ST9-T7 old=>WFms0000851 len=476;type=microsatellite;synonym=st9-t7 Done ========================================================== STEP 3 .... 
Check database ========================================================== % bin/gmod_list_db.pl -check -v > data/daphnia_db.check Feature summary for Chado database ============================================================ Features by Chado::Organism n=13 2839 Daphnia pulex/D.pulex/waterflea 1495 Daphnia pulicaria/D.pulicaria/waterflea 619 Daphnia arenata/D.arenata/waterflea ------------------------------------------------------------ Features by Chado::Pub n=8 1495 data/CGBvntr.fa 1076639416 type=seq_file 397 data/cDNA1.fa 1075475331 type=seq_file 619 data/cDNA2.fa 1075475336 type=seq_file 343 data/est1.fa 1076639361 type=seq_file 634 data/est2.fa 1076639394 type=seq_file 858 data/microDNA.fa 1075475373 type=seq_file 606 data/microsats.fa 1075475397 type=seq_file ------------------------------------------------------------ Features by Chado::Cv Sequence Ontology, n=897 978 EST 2511 cDNA_clone 1464 microsatellite ------------------------------------------------------------ Public ID counter ID_Tag Last_ID Description WFms 1464 id counter for microsatellite by bin/gmod_load_newseq.pl WFes 962 id counter for EST by bin/gmod_load_newseq.pl WFcl 2511 id counter for cDNA_clone by bin/gmod_load_newseq.pl ------------------------------------------------------------ Chado::Feature total=4953 #n NOTE: the Features by Cv count (978 EST) doesn't match the Public ID counter (WFes 962). #n This was caused by a bug in the loader when the run was aborted for the missing D.arenata organism. #n Fixed; now each ID is checked before adding. 
Duplicate checksums Name____ Length Seq_type Synonym Feat_id Publication Checksum WFcl0000417 894 cDNA_clone P2-H82000FW52945 417 'data/CGBvntr.fa 1076639416' d30251dbae76266f80c384971ad71516 WFms0001051 894 microsatellite P2-H82000FW52945 4540 'data/microsats.fa 1075475397' d30251dbae76266f80c384971ad71516 WFcl0001328 763 cDNA_clone PB96-H022000FW3761138 1328 'data/CGBvntr.fa 1076639416' 3c6e8345eac1a34cef1f42209924cd44 WFms0000775 763 microsatellite PB96-H022000FW3761138 4264 'data/microDNA.fa 1075475373' 3c6e8345eac1a34cef1f42209924cd44 WFcl0000536 473 cDNA_clone P3-A192000FW52524 536 'data/CGBvntr.fa 1076639416' c81db7dcb50d624c5d7c1a46a8be3b9f WFms0000290 473 microsatellite P3-A192000FW52524 3779 'data/microDNA.fa 1075475373' c81db7dcb50d624c5d7c1a46a8be3b9f .... 3200 dup seq checksums ------------------------------------------------------------ ============================================================ Done ========================================================== STEP 4 .... Dump out sequences with Public IDs ========================================================== #n Sequences can be selected by organism, publication (file), or seq type. #n Only FASTA format is currently provided. % bin/gmod_dump_seq.pl -type=EST -out=data/daphnia_EST.fa 978 features dumped Done % bin/gmod_dump_seq.pl -type=cDNA_clone -out=data/daphnia_cDNA_clone.fa 2511 features dumped Done % bin/gmod_dump_seq.pl -type=microsatellite -out=data/daphnia_microsatellite.fa 1464 features dumped Done #n Formatted deflines with database info are like this. 
#n use -checksum flag to add checksum to defline >WFes0000300 len=166;type=EST;synonym=3C-G06,WFBid1847;contact=Tcrease;library=Guelphest ;date=Jan2004;taxon=D.pulex;clone=3C-G06;strain=WindsorPond,Ontario GTTGGCTGGAGGACCGGCGCTGGCCAGTNGACTCTAGACTCGAGCAGCTT ATGCATGCGGCCGCAATTCGAGCTCACTTGGCCAATTCGCCCTATAGTGA GTCTGTATTACAATTCACTGGCCGTCGTTTTACAACGTCGTGACTGGGAA AACCCTGGCGGTGTAG >WFes0000499 len=183;type=EST;synonym=070803d_E01_d_007,WFBid2048;contact=JColbourne;lib rary=HCGSest1;date=Jan2004;taxon=D.arenata;clone=070803d_E01_d_007;strain=Log52,Oregon ACACAGNATAATCTGTTGTATTTTTATTATCAATTTAGATTCTAGTATTG AAAGAGGAAAGTGAGAACTGGCGCAATACTTTTTTTTGTCCGATTTGTCA ATTAACTTTACATTTTTGGAAAAAGATTAAAAAAATACAAAGAATGTTCT AGTACAAAAAAAAAAAAAAAAAAAAAAAAAAAA