Вы находитесь на странице: 1из 26

-

:
: 424
:

-
2014


......................................................................................................................3
........................................................................................................5
.......................................................................................8
.......................................................................10
......................................................................................................11
.................................................................................................12
.........................................................................................................................13
....................................................................................................14
................................................................................................................15
..........................................................................................15
...............................................................................15
ab initio..............................................................................................15
......................................................................................17
.................................................................................................................18
....................................................................................................19
1.............................................................................................................20
2.............................................................................................................21
3.............................................................................................................22
4.............................................................................................................23
5.............................................................................................................24
6.............................................................................................................25



Sympetrum sanguineum (Ruddy Darter) de
novo. .
.
,
: 300 ,

.
,
, .

, .

,
,
.
2013 6650 ,
(Ladona
fulva).
,
.

,
:
.
,
.
,
.

.
... iPlant
Collaborative.
EMC .
. ...
: " "
3

"" ,

, -
.



. ,
,
(, , )
(). ,
A, C, G, T .
(
, ).
,
. :
ACGG TGCC . -
, . :
ACTTG CAAGT -.
.
, ,
,
. ,

.
, .
, - , , .
(T), (U).
.
.
. -

.
, .
ACGT UGCA.
.
, ,

.

,
, . , ,
(, ) .
5

.
. ,
()
().
, :
.

.
.
,
,
.
,
.
.
.
.
, .
. , UAU
, GGU . -,
-,
,
. ,

. , CUU, CUC, CUA, CUG .
.
UAUGGU ,
.

. :
, , . :
, , .
.

: ( ) (
).
. .
,
6

( , , ) .
,
, .

, -
. -,
,
,
. -,
,
, (),
. ()
.
,
. -
,
.

N .

.

,
,
.
,
.

.
,
.



.
Next Generation Sequencing (NGS) .
NGS .
(Sanger)
1
. NGS ,
Sanger. 2011
,
NGS .
Illumina/Solexa
.
70-300 0,05–0,15$ 1 . 9
. -
,
.
Illumina/Solexa
.
.
.
- (
).
, .
.
.

.
(A,C,G,T) .
,
.

,
.
FASTQ[1]. ,
FASTA. FASTA
:

>

>

...
Illumina/Solexa , , -
.
:
.
.
, ,
.
. ,
- . -
.



Sympetrum sanguineum,
Ladona fulva.
, ,
.

.

.

GenomeSize [2]. 0.4 - 2.2 , ,
1 ~ 978 [3], 0.39 2.15 .
24-28.

.
() - ,

.

. [4] :
0.6-1.5 .
Axeq
Technologies, Macrogen Inc .
:
.

. :

. - 2014
.

10



[5]. .
.
, de novo
, ,
. .
.

,
. MAKER [6] ,
.
MAKER .
SQLite
Perl [ 1].
GFF [7],
, .

11



. ,
,
de
novo.
.
GC- . GC-
G C.

().
,

. (
)
. ,
. QUAKE
[8] .

12


2011 NGS
[9]
, .
, NGS

.

.
, .
SOAPdenovo [10],
ABySS IDBA-UD [11] .
,
.
N50, L50,
( ),
.
N50 ,
. ,
. L50 N50.

13


,
. , BLAST [12]
,
. ( )

N.
: ,
47% .
,
- .
, ,
,
,
.
RepeatModeler
[13]. RepeatMasker [14]
RepeatModeler,
. RepeatMasker [ 2].
,
.
de novo .
RepeatMasker,
Dfam [15] RepBase [16].

14



UniProt-KB/Swiss-Prot
[17]. , ,
BLASTX [12].
, [18].
.
E-value 1E-4. E-value

,
.
BLAST. ,
, [ 4].
FASTA
. ID
UniProt-KB/Swiss-Prot. , ,
[ 5].
MAKER
, , .


, .
Trinity [19].
[ 3].
ab initio

.
.
, ab initio

. , MAKER ab initio
, .

15

ab initio ,
MAKER,
,
. ab initio :
SNAP [20], Augustus [21], GeneMark-ES [22].
MAKER GeneMark-ES
,
GFF MAKER.

16


.
CEGMA,
.
( ),
CEGMA.
CEGMA 54% .
gff 51% .
-
.
CEGMA :
,
GNU/Linux .
[ 6].
MAKER .
: 5' ,
,
, ,
, SNAP, , 3'
.
CEGMA. ,
. ,

.

17



L.fulva.

,
,
.

, , ,
,
. .

,
Met , Ladona fulva
MegaBLAST. Met

. .

18


[1] Cock et al (2009) The Sanger FASTQ file format for sequences with quality scores, and the
Solexa/Illumina FASTQ variants. Nucleic Acids Research, doi:10.1093/nar/gkp1137
[2] Gregory, T.R. (2014). Animal Genome Size Database. http://www.genomesize.com
[3] Dolezel et al.(2003) Cytometry 51A: 127-128
[4] Bunge et al. CotQuest: Improved algorithm and software for nonlinear regression analysis of
DNA reassociation kinetics data. Anal Biochem 2009, 388:322-330.
[5] Mark Yandell, Daniel Ence (2012) A beginners guide to eukaryotic genome annotation. Nature
Reviews Genetics doi:10.1038/nrg3174
[6] Holt C, Yandell M. MAKER2: an annotation pipeline and genome-database management tool
for second-generation genome projects. BMC Bioinformatics 2011 Dec 22;12:491
[7] Generic Feature Format Version 3 (GFF3) www.sequenceontology.org/gff3.shtml
[8] Kelley DR, Schatz MC, Salzberg SL. Quake: quality-aware detection and correction of
sequencing errors. Genome Biology 11:R116 2010
[9] Compeau PE, Pevzner PA, Tesler G (2011) How to apply de Bruijn graphs to genome assembly.
Nature Biotechnology 29, 987–991. doi:10.1038/nbt.2023
[10] Luo et al.: SOAPdenovo2: an empirically improved memory-efficient short-read de novo
assembler. GigaScience 2012 1:18
[11] Peng, Y., et al. (2012) IDBA-UD: a de novo assembler for single-cell and metagenomic
sequencing data with highly uneven depth, Bioinformatics, 28, 1420-1428.
[12] Gish, W. & States, D.J. (1993) "Identification of protein coding regions by database similarity
search." Nature Genet. 3:266-272. <http://blast.ncbi.nlm.nih.gov/>
[13] Smit, AFA, Hubley, R. RepeatModeler Open-1.0. 2008-2010 <http://www.repeatmasker.org>
[14] Smit, AFA, Hubley, R & Green, P.RepeatMasker Open-3.0. 1996-2010
<http://www.repeatmasker.org>
[15] Travis J. Wheeler, Jody Clements, Sean R. Eddy, Robert Hubley, Thomas A. Jones, Jerzy
Jurka, Arian F. A. Smit, and Robert D. Finn (2012) Dfam: a database of repetitive DNA based
on profile hidden Markov models
[16] Jurka J, Kapitonov VV, Pavlicek A, Klonowski P, Kohany O, Walichiewicz J. (2005) Repbase,
a database of eukaryotic repetitive elements
[17] The UniProt Consortium, Ongoing and future developments at the Universal Protein
Resource, Nucleic Acids Res. 39: D214-D219 (2011)
[18] De Wit P, Pespeni MH, Ladner JT, Barshis DJ, Seneca F, Jaris H, Overgaard Therkildsen N,
Morikawa M and Palumbi SR (2012) The simple fool's guide to population genomics via RNA-Seq: an introduction to high-throughput sequencing data analysis. Molecular Ecology
Resources 12, 1058-1067
[19] Grabherr MG, Haas BJ, Yassour M, Levin JZ, Thompson DA, Amit I, Adiconis X, Fan L,
Raychowdhury R, Zeng Q, Chen Z, Mauceli E, Hacohen N, Gnirke A, Rhind N, di Palma F,
Birren BW, Nusbaum C, Lindblad-Toh K, Friedman N, Regev A. Full-length transcriptome
assembly from RNA-seq data without a reference genome. Nat Biotechnol. 2011 May
15;29(7):644-52. doi: 10.1038/nbt.1883. PubMed PMID: 21572440
[20] Korf, I. Gene finding in novel genomes. BMC Bioinformatics 5, 59 (2004).
[21] Oliver Keller, Martin Kollmar, Mario Stanke, Stephan Waack (2011) A novel hybrid gene
prediction method employing protein multiple sequence alignments Bioinformatics, doi:
10.1093/bioinformatics/btr010
[22] Lukashin, A. V. & Borodovsky, M. GeneMark.hmm: new solutions for gene finding. Nucleic
Acids Res. 26, 1107–1115 (1998).
19

1
MAKER ,
. ,
gff .
SQLite
gff . MAKER
:
DBD::SQLite::db selectall_arrayref failed: Expression tree is too large (maximum depth 1000)

SQLite
.
Makefile.PL SQLite,
my @CC_DEFINE '-DSQLITE_MAX_EXPR_DEPTH=0',:
1
2
3
4
5
6
7
8
9
10
11
12
13
14

# List of -D preprocessor flags, quoted from the SQLite driver's Makefile.PL
# (presumably handed to the C compiler when the bundled SQLite is built).
my @CC_DEFINE = (
# '-DSQLITE_CORE',
'-DSQLITE_ENABLE_FTS3',
# L. Dami 10.07.2010 : now enabling new FTS3 syntax, because
# that's the recommendation from SQLite for new applications
# (used to be "Disabled until we have a test for this").
# This change MAY POSSIBLY BREAK OLD APPLICATIONS THAT ALREADY
# USED FTS3 ... but sooner or later that change had to be done !
'-DSQLITE_ENABLE_FTS3_PARENTHESIS', # for sqlite >= 3.6.10
'-DSQLITE_ENABLE_RTREE', # for sqlite >= 3.6.10
'-DSQLITE_ENABLE_COLUMN_METADATA',
'-DNDEBUG=1',
# Added locally: the default limit of 1000 on expression-tree depth caused
# "Expression tree is too large (maximum depth 1000)". Per the SQLite limits
# documentation, a value of 0 disables the depth check entirely.
'-DSQLITE_MAX_EXPR_DEPTH=0',
);

SQLite , CPAN
look DBD::SQLite

20

2
RepeatMasker ,
.
configure.
:
# Upstream version: runs makeblastdb on the nucleotide library
# (RepeatMasker.lib) and on the protein library (RepeatPeps.lib).
# stdout and stderr of both commands are sent to /dev/null, and the return
# value of system() is not checked, so failures pass unnoticed.
system( "$pgLocation/makeblastdb -dbtype nucl -in "
. "$rmLocation/Libraries/RepeatMasker.lib > /dev/null 2>&1" );
system( "$pgLocation/makeblastdb -dbtype prot -in "
. "$rmLocation/Libraries/RepeatPeps.lib > /dev/null 2>&1" );

:
# Patched version: build the BLAST databases for the nucleotide and protein
# repeat libraries, naming each database explicitly with -out so that it is
# written next to its library file.
#
# The list form of system() is used so that no shell is involved: the command
# can be wrapped over several source lines without a newline ending up inside
# it (a newline between "-out" and its value makes the shell treat them as two
# separate commands), and paths containing spaces are passed through intact.
# Output is left visible and a failure is reported instead of being ignored.
system(
    "$pgLocation/makeblastdb",
    '-dbtype', 'nucl',
    '-in',     "$rmLocation/Libraries/RepeatMasker.lib",
    '-out',    "$rmLocation/Libraries/RepeatMasker.lib",
) == 0
    or warn "makeblastdb failed for RepeatMasker.lib (status $?)\n";
system(
    "$pgLocation/makeblastdb",
    '-dbtype', 'prot',
    '-in',     "$rmLocation/Libraries/RepeatPeps.lib",
    '-out',    "$rmLocation/Libraries/RepeatPeps.lib",
) == 0
    or warn "makeblastdb failed for RepeatPeps.lib (status $?)\n";

:
# Upstream version: runs xdformat with -n on RepeatMasker.lib and with -p on
# RepeatPeps.lib, passing -I in both cases.
# stdout and stderr of both commands are sent to /dev/null, and the return
# value of system() is not checked.
system( "$wuLocation/xdformat -n -I "
. "$rmLocation/Libraries/RepeatMasker.lib > /dev/null 2>&1" );
system( "$wuLocation/xdformat -p -I "
. "$rmLocation/Libraries/RepeatPeps.lib > /dev/null 2>&1" );

:
# Patched version: format the repeat libraries for WU-/AB-BLAST with xdformat,
# writing each database next to its library file.
#
# The database name is given with xdformat's -o option. Redirecting stdout
# onto the library itself ("> .../RepeatMasker.lib") must NOT be used: the
# shell truncates the redirect target before xdformat starts, so the input
# library would be emptied and lost.
# NOTE(review): -o is the output-database-name option per the xdformat usage
# text; confirm against the installed xdformat version.
#
# The list form of system() avoids the shell altogether; failures are
# reported instead of being silently ignored.
system(
    "$wuLocation/xdformat", '-n', '-I',
    '-o', "$rmLocation/Libraries/RepeatMasker.lib",
    "$rmLocation/Libraries/RepeatMasker.lib",
) == 0
    or warn "xdformat failed for RepeatMasker.lib (status $?)\n";
system(
    "$wuLocation/xdformat", '-p', '-I',
    '-o', "$rmLocation/Libraries/RepeatPeps.lib",
    "$rmLocation/Libraries/RepeatPeps.lib",
) == 0
    or warn "xdformat failed for RepeatPeps.lib (status $?)\n";

RepeatMasker.

21

3
java 1.8, Trinity
Trinity.pl:
:
# Upstream version: abort unless $java_version contains
# 'java version "1.6.' / '"1.7.' (or the same with "openjdk").
# Any other version string, including 1.8, is rejected with the message below.
unless ($java_version =~ /(java|openjdk) version \"1\.[67]\./) {
die "Error, Trinity requires access to Java version 1.6 or 1.7. Currently installed version is:
$java_version";
}

:
# Patched version: accept Java 1.6, 1.7 and 1.8 (Oracle "java" or "openjdk"
# version strings) and abort on anything else.
if ($java_version !~ /(java|openjdk) version \"1\.[678]\./) {
    die "Error, Trinity requires access to Java version 1.6, 1.7 or 1.8. Currently installed version is:
$java_version";
}

22

4

BLAST.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

#!/usr/bin/env python
"""Extract the top Swiss-Prot hit for each query from tabular BLASTX output.

Input is BLAST tabular output with comment lines (``-outfmt 7``) from a
blastx search against Swiss-Prot.  For every query, the first hit whose
e-value does not exceed the threshold is reported as ``query<TAB>accession``.

usage: parse_blastx2Sprot.py sprotblastoutput.txt evalue outfilename.txt
"""
import sys


def extract_top_hits(lines, evalue):
    """Yield (query_name, uniprot_accession) for the first passing hit per query.

    lines  -- iterable of tabular BLAST lines (trailing newline optional)
    evalue -- maximum accepted e-value (float); column 11 is compared to it

    Comment lines (starting with '#') and blank lines are skipped.  The
    accession is the second '|'-separated field of the subject ID
    (e.g. 'sp|P12345|ABC_HUMAN' -> 'P12345').  Hits of the same query are
    assumed to be adjacent, as BLAST writes them.
    """
    queryname = ''
    for line in lines:
        line = line.rstrip()
        # Blank lines used to crash on line[0]; treat them like comments.
        if not line or line[0] == '#':
            continue
        cols = line.split('\t')
        if cols[0] == queryname:
            # A hit for this query has already been reported.
            continue
        if float(cols[10]) <= evalue:
            ID = cols[1].split('|')
            queryname = cols[0]
            yield cols[0], ID[1]


def main(argv):
    """Command-line entry point: argv = [script, infile, evalue, outfile]."""
    if len(argv) != 4:
        sys.exit('usage: parse_blastx2Sprot.py sprotblastoutput.txt '
                 'evalue outfilename.txt')
    evalue = float(argv[2])
    # Context managers make sure both files are flushed and closed.
    with open(argv[1], 'r') as infile, open(argv[3], 'w') as out:
        for query, uniprot_id in extract_top_hits(infile, evalue):
            # Output format kept as before: each record is preceded by a
            # newline, so the file starts with one empty line.
            out.write('\n' + query + '\t' + uniprot_id)


if __name__ == '__main__':
    main(sys.argv)

23

5
Perl
SwissProt.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

#this script takes text list of proteins (generated by parse_blastx2Sprot.py)
#and create fasta lib.
#usage is perl getProteinsFastaFromID.pl SwissProt.fasta proteins.txt > result.fasta
#
# For every protein ID in proteins.txt (second column of each line) the
# SwissProt library is searched for the first line containing that ID. On a
# match, that library line is printed to STDOUT, followed by the sequence
# returned by Bio::Perl's get_sequence('swissprot', ID) and a newline.
# Each ID is emitted at most once.
use Bio::Perl;
use strict;
use warnings;
use 5.010;

my ( $sp_path, $ids_path ) = @ARGV;
die "usage: perl getProteinsFastaFromID.pl SwissProt.fasta proteins.txt > result.fasta\n"
    unless defined $sp_path && defined $ids_path;

# SwissProt library: read once, scanned for every ID.
open( my $sp_fh, '<', $sp_path ) or die "Cannot open $sp_path: $!\n";
my @plines = <$sp_fh>;
close $sp_fh;

open( my $ids_fh, '<', $ids_path ) or die "Cannot open $ids_path: $!\n";

my %used;    # IDs already written to the output

ID:
while ( my $lline = <$ids_fh> ) {
    chomp $lline;

    # Drop the first column (query name); what remains is the protein ID.
    $lline =~ s/^\S+\s*//;

    # An empty ID would match every library line (index() returns 0), and
    # the ID list produced by parse_blastx2Sprot.py starts with a blank line.
    next ID if $lline eq '';
    next ID if $used{$lline};

    foreach my $pline (@plines) {
        next if index( $pline, $lline ) < 0;

        $used{$lline} = 1;

        # Fetch the sequence only for IDs that are actually printed, so
        # duplicates and unmatched IDs cost no remote lookup.
        my $seq_object = get_sequence( 'swissprot', $lline );
        print $pline;
        print $seq_object->seq;
        print "\n";
        last;
    }
}

close $ids_fh;

24

6

Genewise. Ubuntu
, Genewise
glib-2
glib-1.
/src
# Edit two source files in place, keeping the originals as *.old backups:
#  - HMMer2/sqio.c: replace the first "getline" on each line with "getline_new"
#  - models/phasemodel.c: replace the first "isnumber" on each line with "isdigit"
# (no /g flag, so only the first occurrence per line is rewritten)
sed -i.old 's/getline/getline_new/' HMMer2/sqio.c
sed -i.old 's/isnumber/isdigit/' models/phasemodel.c

getline
isnumber.

# In every file named "makefile" below the current directory, replace the
# glib-1 helper "glib-config" with the equivalent pkg-config query for glib-2.
# Originals are kept as makefile.old.
# The pkg-config module name is "glib-2.0" for both --libs and --cflags; each
# command must stay on a single line so that "{}" + is still part of -exec.
find ./ -type f -name "makefile" -exec sed -i.old 's/glib-config --libs/pkg-config --libs glib-2.0/g' "{}" +;
find ./ -type f -name "makefile" -exec sed -i.old 's/glib-config --cflags/pkg-config --cflags glib-2.0/g' "{}" +;

make glib1
glib-2.0 .
.
CEGMA :
Use of uninitialized value in numeric eq (==) at
~/Dragonfly/tools/cegma_v2.4.010312/lib/geneid.pm line 266.
some values in Markov model with zero counts, use pseudocounts at
~/Dragonfly/tools/cegma_v2.4.010312/lib/geneid.pm line 272.


, .
, /lib/geneid.pm
markov_model :

25

1
2
3
4
5
6
7
8
9
10
11
12
13
14

sub markov_model {
my ($order, $seqs, $pseudo, $frame) = @_;
#0.2 instead of 0
$pseudo = 0.2 unless defined $pseudo;
my @alph = qw(A C G T);
my $table = geneid::dna_word_table($order, "{}");
foreach my $prefix (keys %$table) {
foreach my $nt (@alph) {
for (my $fr = 0; $fr <= $frame; $fr++){
$table->{$prefix}{$nt}{$fr} = $pseudo;
}
}
}

26