<!-- game: container element declared with the ANY content model, so it may
hold any mix of the elements declared in this DTD and character data, in any
order. No other content model in this DTD references game, so it is
presumably the document root (confirm against instance documents). -->
<!ELEMENT game ANY>
<!-- ************************************************************* -->
<!-- GAME Genome Annotation Markup Elements -->
<!-- Document Type Definition DTD - June. 5,1999 - Version 1.0 -->
<!--
Suzanna E.
Lewis
-->
<!--
Erwin
Frise
-->
<!-- University of California Berkeley -->
<!-- $Id: game.dtd.html,v 1.1 2000/03/07 19:54:51 bradmars Exp $ -->
<!-- Annotations are a summarization of all the collected features
discerned
and described on related sequences of genomic DNA, transcripts,
mRNAs (and
cDNAs which are treated as their logical equivalent), and
proteins. Each of
these molecules has regions along their linear length
described by
annotators as 'features'. The features themselves are a
combined summary of
both computational and genetic analysis of that DNA,
RNA, or AA sequence.
Computational analyses are not considered 'features'
and are treated as
primary data, as are any experimental analyses carried
out at the bench. In
other words, analytical results may be used to identify
features, but are
not considered features on their own in this context.
Thus, each molecule is
described both in terms of primary analytical results
and in terms of expert
defined features that are supported by the preceding
results. The
combination of all these associated feature descriptions on the
related
molecules (from genomic to protein) constitute a statement that is
called an
annotation.
-->
<!-- '' == ONE, NO MORE NO LESS -->
<!-- '?' == ZERO OR ONE.
-->
<!-- '*' == ZERO OR MORE. -->
<!-- '+' == ONE OR MORE.
-->
<!-- General purpose entities and elements that are used in
multiple elements -->
<!-- integer: parameter entity used as the attribute type wherever an integer
is intended. It expands to NMTOKEN, so a validating parser checks only that
the value is a name token, not that it is numeric. -->
<!ENTITY % integer "NMTOKEN">
<!-- Text-only leaf elements that are reused by several content models. -->
<!ELEMENT type    (#PCDATA)>
<!ELEMENT value   (#PCDATA)>
<!ELEMENT name    (#PCDATA)>
<!ELEMENT synonym (#PCDATA)>
<!ELEMENT term    (#PCDATA)>
<!ELEMENT program (#PCDATA)>
<!ELEMENT version (#PCDATA)>
<!ELEMENT score   (#PCDATA)>
<!-- ISO date format (declared as plain text; the format is not enforced
by the DTD) -->
<!ELEMENT creation_date (#PCDATA)>
<!ELEMENT date          (#PCDATA)>
<!-- description: for comments and other free text -->
<!ELEMENT description   (#PCDATA)>
<!-- residues: DNA, RNA, AA -->
<!ELEMENT residues      (#PCDATA)>
<!ELEMENT alignment     (#PCDATA)>
<!-- Generic pairs: each of these holds exactly one type element followed by
exactly one value element. -->
<!ELEMENT parameter (type, value)>
<!ELEMENT output    (type, value)>
<!ELEMENT parent    (type, value)>
<!-- these must be integers (they are declared as #PCDATA, so a validating
parser does not enforce this) -->
<!ELEMENT offset (#PCDATA)>
<!ELEMENT length (#PCDATA)>
<!ELEMENT from   (#PCDATA)>
<!ELEMENT to     (#PCDATA)>
<!ELEMENT start  (#PCDATA)>
<!ELEMENT end    (#PCDATA)>
<!-- Text-only elements that describe where the sequence came from -->
<!ELEMENT species (#PCDATA)>
<!ELEMENT tissue  (#PCDATA)>
<!ELEMENT stage   (#PCDATA)>
<!ELEMENT project (#PCDATA)>
<!-- The entity 'site_operator' defines an optional enumerated attribute
(less_than | greater_than) for the fuzzy start and fuzzy end elements,
e.g. the location <345..500 is written with
<fuzzy_start site_operator="less_than">. -->
<!ENTITY % site_operator "site_operator (less_than | greater_than)">
<!-- fuzzy_start / fuzzy_end: each wraps exactly one span. The attribute list
is attached to the fuzzy elements themselves so that the documented usage
validates. -->
<!ELEMENT fuzzy_start (span)>
<!ATTLIST fuzzy_start
  %site_operator; #IMPLIED
>
<!ELEMENT fuzzy_end (span)>
<!ATTLIST fuzzy_end
  %site_operator; #IMPLIED
>
<!-- Backward compatibility: earlier revisions of this DTD declared
site_operator on the plain start and end elements instead. Those declarations
are retained so documents written against them remain valid. -->
<!ATTLIST start
  %site_operator; #IMPLIED
>
<!ATTLIST end
  %site_operator; #IMPLIED
>
<!-- fuzzy_span: one fuzzy_start followed by one fuzzy_end. -->
<!ELEMENT fuzzy_span (fuzzy_start, fuzzy_end)>
<!-- either_dir attribute is because some features do not have
an orientation associated with them, but apply
equally well
to either strand -->
<!--
The between attribute is used to indicate a position between 2
bases (or more generally between 2 sites?).
It is important to note that this flag is preferred
to a
length of zero. The problem with a 0 length
span is that it
is not possible to tell if one
means before or after the
current base -->
<!-- span: one start element followed by one end element. Both attributes are
optional single-value flags; the only permitted value is TRUE, so a flag is
either present as TRUE or absent. -->
<!ELEMENT span (start, end)>
<!ATTLIST span
  between    (TRUE) #IMPLIED
  either_dir (TRUE) #IMPLIED
>
<!-- SOME EXAMPLES
I've taken the locations from the
descriptions in the GB Feature Table
Definition.
For descriptions of what the feature spans mean, see:
http://www.ncbi.nlm.nih.gov/collab/FT/components.html#location_descriptors
(David Emmert, Harvard)
- Location: 467
<span>
<offset>466</offset>
<length>1</length>
</span>
- Location: 340..565
<span>
<offset>339</offset>
<length>225</length>
</span>
- Location: <345..500
<fuzzy_span>
<fuzzy_start
site_operator="less_than">
<span>
<offset>344</offset>
<length>1</length>
</span>
</fuzzy_start>
<fuzzy_end>
<span>
<offset>499</offset>
<length>1</length>
</span>
</fuzzy_end>
</fuzzy_span>
- Location: (102.110)
<span>
<offset>101</offset>
<length>8</length>
</span>
- Location: (23.45)..600
<fuzzy_span>
<fuzzy_start>
<span>
<offset>22</offset>
<length>22</length>
</span>
</fuzzy_start>
<fuzzy_end>
<span>
<offset>599</offset>
<length>1</length>
</span>
</fuzzy_end>
</fuzzy_span>
- Location: (122.133)..(204.221)
<fuzzy_span>
<fuzzy_start>
<span>
<offset>121</offset>
<length>11</length>
</span>
</fuzzy_start>
<fuzzy_end>
<span>
<offset>203</offset>
<length>17</length>
</span>
</fuzzy_end>
</fuzzy_span>
- Location: 123^124
<span
between="TRUE">
<offset>122</offset>
<length>1</length>
</span>
-->
<!-- Annotation sub-elements. -->
<!-- NAME: the official (by
someone's standard) symbol to use -->
<!-- DBXREF: The database
cross-reference element refers to a
database where the annotation is
generated and maintained. -->
<!-- GENE: Can't seem to avoid the
nefarious gene concept. There are
different relationships a gene can have to
the annotation element.
One is a positive identification (or assignment) to
a gene. The alternate to
this is a list of known genes (from traditional
genetic analysis) any of
which are possible candidates for assigning to this
annotation. Both of
these assignment elements are naturally supposed to be
within the same
species. Relationships to other genes (either within or in
other species) is
indicated by enclosing zero or more related gene elements.
The specifics of
the type of relationship is held within the sub-element.
-->
<!-- DESCRIPTION: is a comment, a free text field for the curators
to jot down
any additional information. -->
<!-- FEATURE_SET: to
make it possible to set this up in an analogous manner
to computational_analysis and result_set -->
<!-- SEQ: what sequence this annotation applies to -->
<!--
Annotation attributes. -->
<!-- The id is a unique identifier for
other elements to use
when referencing this
annotation. -->
<!-- annotation: children must appear in the order listed. name, dbxref,
description and seq are optional and occur at most once; gene, aspect and
feature_set may repeat. id is required; seq is an optional IDREF. -->
<!ELEMENT annotation
  (name?, dbxref?, gene*, aspect*, description?, feature_set*, seq?)>
<!ATTLIST annotation
  id  ID    #REQUIRED
  seq IDREF #IMPLIED
>
<!-- aspect: a dbxref followed by exactly one of function, process or
cellular_component, each of which is plain text. -->
<!ELEMENT aspect
  (dbxref, (function | process | cellular_component))>
<!ELEMENT function           (#PCDATA)>
<!ELEMENT process            (#PCDATA)>
<!ELEMENT cellular_component (#PCDATA)>
<!-- Obviously there are other sorts of gene to gene relationships and
these
still need to be added -->
<!-- association: required enumerated attribute on gene naming the kind of
relationship. -->
<!ENTITY % association
  "association (HOMOLOG | ORTHOLOG | PARALOG | IS | MAY_BE)">
<!-- gene: a mandatory dbxref followed by optional name, synonym, species and
description, in that order. annotation is an optional IDREF. -->
<!ELEMENT gene
  (dbxref, name?, synonym?, species?, description?)>
<!ATTLIST gene
  %association; #REQUIRED
  annotation IDREF #IMPLIED
>
<!-- seq sub-elements. -->
<!-- The seq element represents the
different DNA, RNA, and AA molecules. -->
<!-- The database cross-reference refers to a sequence database like
genbank
or embl (only for genomic and cDNAs of course). -->
<!-- A
single origin/source is requested to indicate the derivation of the
primary
sequence (this is basically clone information for genomic and cDNA
data).
-->
<!-- The residues are always optional for any of these. -->
<!-- seq element IDs are used to support derivation between seq elements
-->
<!-- Molecular element attributes. -->
<!-- Each has a unique
identifier for other elements to use when referencing
this sequence
molecule. It may also act as a label in displays.
Because the DNA, RNA, or
AA residue elements are optional a length attribute
is required. The length
provides the extent of the number line along which
the features and analysis
are positioned -->
<!-- These aspects are associated with an
individual sequence
and not the annotation because
a single annotation may
describe the different gene
products that arise from
the same region of the
genome -->
<!-- Enumerated attribute definitions used in the seq attribute list. Each
entity expands to an attribute name followed by its permitted values. -->
<!ENTITY % maturity
  "maturity (primary | processed | pro | pre-pro | pre-pro-pro)">
<!ENTITY % transcript_function
  "transcript_function (mRNA | rRNA | snoRNA | snRNA | tRNA | trans_spliced_leader)">
<!ENTITY % immigrant
  "immigrant (transposon | pseudogene | mobile_intron | virus | plasmid)">
<!-- do we really want CDS? it seems redundant.
right-o its gone, also chucked cDNA -->
<!-- (The note above previously contained a double hyphen, which XML does not
permit inside a comment; it has been reworded so the DTD parses.) -->
<!ENTITY % seq_type "type (AA | RNA | DNA)">
<!-- seq: all children are optional and must appear in the order listed;
dbxref, map_position, clone and parent may repeat. id and type are required;
every other attribute, including length, is declared #IMPLIED. -->
<!ELEMENT seq
  (name?, dbxref*, map_position*, source?, project?, clone*,
   description?, residues?, parent*)>
<!ATTLIST seq
  id          ID        #REQUIRED
  %seq_type;            #REQUIRED
  produced_by IDREF     #IMPLIED
  length      %integer; #IMPLIED
  %maturity;            #IMPLIED
  %transcript_function; #IMPLIED
  %immigrant;           #IMPLIED
>
<!-- seq_relationship: either a span or a fuzzy_span, optionally followed by
an alignment. Both attributes are optional: seq is an IDREF and type is one
of query, subject, peer or subseq. -->
<!ELEMENT seq_relationship
  ((span | fuzzy_span), alignment?)>
<!ATTLIST seq_relationship
  seq  IDREF                             #IMPLIED
  type (query | subject | peer | subseq) #IMPLIED
>
<!-- map_type: required enumerated type attribute of map_position. -->
<!ENTITY % map_type "type (cytological | linear | ordering)">
<!-- map: text naming the map (for example a chromosome arm such as 2R).
The map_position content model requires this element, but it was not
previously declared, so no document containing map_position could be valid. -->
<!ELEMENT map (#PCDATA)>
<!-- map_position: one map followed by an optional span. type is required;
seq is an optional IDREF. -->
<!ELEMENT map_position (map, span?)>
<!ATTLIST map_position
  %map_type; #REQUIRED
  seq IDREF #IMPLIED
>
<!-- an example of a mapping
<map_position type="cytological">
<map>2R</map>
<span>
<start>35A</start>
<end>35B</end>
</span>
</map_position>
-->
<!-- ordering of the 'exons' is implied by the ordering of the
features in this set -->
<!-- VERSION: as
the annotation progresses versions are maintained -->
<!-- AUTHOR:
who/what created this annotation -->
<!-- DATE: date this annotation
was first created -->
<!-- feature_set: all children are optional and must appear in the order
listed; seq_relationship, evidence, parent and feature_span may repeat.
id is required; annotation and produces_seq are optional IDREFs. -->
<!ELEMENT feature_set
  (name?, type?, seq_relationship*, author?, creation_date?, version?,
   evidence*, parent*, description?, feature_span*, seq?)>
<!ATTLIST feature_set
  id           ID    #REQUIRED
  annotation   IDREF #IMPLIED
  produces_seq IDREF #IMPLIED
>
<!-- A 'feature' is defined by 3 things: a type,
an interval (start and end) to place it on the
molecule in question,
and the results that support
this designation. -->
<!-- feature_span: an optional type, any number of seq_relationship elements,
any number of evidence elements, then an optional tag_residues. -->
<!ELEMENT feature_span
  (type?, seq_relationship*, evidence*, tag_residues?)>
<!-- computational_analysis: program is the only mandatory child; type,
database, date and version are optional, and parameter and result_set may
repeat. Children must appear in the order listed.
id is an optional ID: without an ID attribute no IDREF (for example the
result attribute of evidence) could point at a computational_analysis. It is
declared #IMPLIED so existing documents stay valid.
seq is an optional IDREF. -->
<!ELEMENT computational_analysis
  (type?, database?, program, date?, version?, parameter*, result_set*)>
<!ATTLIST computational_analysis
  id  ID    #IMPLIED
  seq IDREF #IMPLIED
>
<!-- result_set: every child is optional; seq_relationship, output,
result_span and parent may repeat. -->
<!ELEMENT result_set
  (score?, seq_relationship*, dbxref?, output*, result_span*, parent*)>
<!-- result_span: requires at least one seq_relationship; score and type are
optional and output may repeat. id is an optional ID. -->
<!ELEMENT result_span
  (score?, type?, seq_relationship+, output*)>
<!ATTLIST result_span
  id ID #IMPLIED
>
<!-- tag_residues: wraps exactly one residues element. The offset attribute
is required and uses the integer entity for its type. -->
<!ELEMENT tag_residues (residues)>
<!ATTLIST tag_residues
  offset %integer; #REQUIRED
>
<!-- evidence: an optional dbxref followed by an optional description.
type is optional free text (CDATA); result is an optional IDREF. -->
<!ELEMENT evidence (dbxref?, description?)>
<!ATTLIST evidence
  type   CDATA #IMPLIED
  result IDREF #IMPLIED
>
<!-- IMPORTANT
the element that the evidence
result attribute refers to may either
be a seq,
a computational_analysis or a result_span
depending upon whether or not the computed results are actually
available within the xml document (a result_span).
If not directly
provided this provides a mechanism
to indicate how those results
can be regenerated. A
seq element id means that there is an alignment.
A
computational_analysis indicates what program to run and how.
A result_span element means that the program has
already been
run and the results are available
within the current document.
-->
<!-- experimental_analysis: one or more experimental_conclusion elements,
then exactly one citation, then an optional description. The seq IDREF is
required. -->
<!ELEMENT experimental_analysis
  (experimental_conclusion+, citation, description?)>
<!ATTLIST experimental_analysis
  seq IDREF #REQUIRED
>
<!-- experimental_conclusion: plain text with an optional ID. -->
<!ELEMENT experimental_conclusion (#PCDATA)>
<!ATTLIST experimental_conclusion
  id ID #IMPLIED
>
<!-- database: a mandatory name with optional date and version. -->
<!ELEMENT database   (name, date?, version?)>
<!-- dbxref: a mandatory xref_db followed by an optional xref_db_id; both
children are plain text. -->
<!ELEMENT dbxref     (xref_db, xref_db_id?)>
<!ELEMENT xref_db    (#PCDATA)>
<!ELEMENT xref_db_id (#PCDATA)>
<!-- Everything below this point is very sketchy, so don't
jump to any conclusions from what follows -->
<!-- Species is the origin of the sequence. NOTE: it is described here as
mandatory, but the source content model declares species as optional. -->
<!-- project is who generated this sequence (but not necessarily the
associated features and analyses). -->
<!-- source: optional species, tissue and stage, in that order. -->
<!ELEMENT source
  (species?, tissue?, stage?)>
<!-- clone: one or more database cross references for the clone itself,
then an optional span and an optional description.
Text content for say, ordering information? -->
<!ELEMENT clone
  (dbxref+, span?, description?)>
<!-- use the dublin core here?? -->
<!-- pub_type: required enumerated type attribute of citation. -->
<!ENTITY % pub_type
  "type (Journal | Personal_communication | Proceedings | Book)">
<!-- citation: title and date are mandatory; dbxref, journal, volume and
pages are optional and author may repeat. Children must appear in the order
listed. -->
<!ELEMENT citation
  (dbxref?, title, journal?, date, author*, volume?, pages?)>
<!ATTLIST citation
  %pub_type; #REQUIRED
>
<!-- Text-only children of citation (author is also used by feature_set). -->
<!ELEMENT title   (#PCDATA)>
<!ELEMENT author  (#PCDATA)>
<!ELEMENT volume  (#PCDATA)>
<!ELEMENT pages   (#PCDATA)>
<!ELEMENT journal (#PCDATA)>