From 4aa3d84a12d56c2f0eb3ead1aebb0f0bc4f9992b Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Tue, 7 Jun 2022 18:43:16 +0200 Subject: [PATCH 01/20] add _TAGS --- bigbio/biodatasets/an_em/an_em.py | 3 ++- bigbio/biodatasets/anat_em/anat_em.py | 3 ++- bigbio/biodatasets/ask_a_patient/ask_a_patient.py | 3 ++- bigbio/biodatasets/bc5cdr/bc5cdr.py | 3 ++- bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py | 3 ++- bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py | 3 ++- bigbio/biodatasets/bio_simlex/bio_simlex.py | 3 ++- .../bioasq_2021_mesinesp/bioasq_2021_mesinesp.py | 3 ++- bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py | 3 ++- .../bioasq_task_c_2017/bioasq_task_c_2017.py | 3 ++- bigbio/biodatasets/bioinfer/bioinfer.py | 3 ++- .../biology_how_why_corpus/biology_how_why_corpus.py | 3 ++- bigbio/biodatasets/biomrc/biomrc.py | 3 ++- .../bionlp_shared_task_2009.py | 7 ++++++- .../bionlp_st_2011_epi/bionlp_st_2011_epi.py | 3 ++- .../bionlp_st_2011_ge/bionlp_st_2011_ge.py | 3 ++- .../bionlp_st_2011_id/bionlp_st_2011_id.py | 10 +++++++++- .../bionlp_st_2011_rel/bionlp_st_2011_rel.py | 3 ++- .../bionlp_st_2013_cg/bionlp_st_2013_cg.py | 12 +++++++++++- .../bionlp_st_2013_ge/bionlp_st_2013_ge.py | 3 ++- .../bionlp_st_2013_gro/bionlp_st_2013_gro.py | 3 ++- .../bionlp_st_2013_pc/bionlp_st_2013_pc.py | 3 ++- .../bionlp_st_2019_bb/bionlp_st_2019_bb.py | 3 ++- bigbio/biodatasets/biored/biored.py | 3 ++- bigbio/biodatasets/biorelex/biorelex.py | 3 ++- bigbio/biodatasets/bioscope/bioscope.py | 3 ++- bigbio/biodatasets/biosses/biosses.py | 3 ++- bigbio/biodatasets/cadec/cadec.py | 3 ++- bigbio/biodatasets/cantemist/cantemist.py | 3 ++- bigbio/biodatasets/cas/cas.py | 3 ++- bigbio/biodatasets/cellfinder/cellfinder.py | 3 ++- bigbio/biodatasets/chebi_nactem/chebi_nactem.py | 3 ++- bigbio/biodatasets/chemdner/chemdner.py | 3 ++- bigbio/biodatasets/chemprot/chemprot.py | 3 ++- bigbio/biodatasets/chia/chia.py | 3 ++- .../citation_gia_test_collection.py | 3 ++- 
bigbio/biodatasets/codiesp/codiesp.py | 3 ++- bigbio/biodatasets/cord_ner/cord_ner.py | 3 ++- bigbio/biodatasets/ctebmsp/ctebmsp.py | 3 ++- bigbio/biodatasets/ddi_corpus/ddi_corpus.py | 3 ++- .../biodatasets/diann_iber_eval/diann_iber_eval.py | 3 ++- bigbio/biodatasets/distemist/distemist.py | 3 ++- bigbio/biodatasets/ebm_pico/ebm_pico.py | 3 ++- bigbio/biodatasets/ehr_rel/ehr_rel.py | 3 ++- bigbio/biodatasets/essai/essai.py | 3 ++- bigbio/biodatasets/euadr/euadr.py | 3 ++- .../evidence_inference/evidence_inference.py | 3 ++- bigbio/biodatasets/gad/gad.py | 3 ++- bigbio/biodatasets/genetag/genetag.py | 3 ++- .../genia_ptm_event_corpus/genia_ptm_event_corpus.py | 3 ++- .../genia_relation_corpus/genia_relation_corpus.py | 3 ++- .../genia_term_corpus/genia_term_corpus.py | 3 ++- bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py | 3 ++- bigbio/biodatasets/gnormplus/gnormplus.py | 3 ++- .../hallmarks_of_cancer/hallmarks_of_cancer.py | 3 ++- bigbio/biodatasets/hprd50/hprd50.py | 3 ++- bigbio/biodatasets/iepa/iepa.py | 3 ++- bigbio/biodatasets/jnlpba/jnlpba.py | 3 ++- bigbio/biodatasets/linnaeus/linnaeus.py | 3 ++- bigbio/biodatasets/lll/lll.py | 1 + bigbio/biodatasets/mantra_gsc/mantra_gsc.py | 3 ++- bigbio/biodatasets/mayosrs/mayosrs.py | 3 ++- bigbio/biodatasets/med_qa/med_qa.py | 3 ++- bigbio/biodatasets/medal/medal.py | 3 ++- bigbio/biodatasets/meddialog/meddialog.py | 3 ++- bigbio/biodatasets/meddocan/meddocan.py | 3 ++- bigbio/biodatasets/medhop/medhop.py | 3 ++- bigbio/biodatasets/medical_data/medical_data.py | 3 ++- bigbio/biodatasets/mediqa_nli/mediqa_nli.py | 3 ++- bigbio/biodatasets/mediqa_qa/mediqa_qa.py | 3 ++- bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py | 3 ++- bigbio/biodatasets/medmentions/medmentions.py | 3 ++- bigbio/biodatasets/mednli/mednli.py | 3 ++- bigbio/biodatasets/meqsum/meqsum.py | 3 ++- bigbio/biodatasets/minimayosrs/minimayosrs.py | 3 ++- bigbio/biodatasets/mirna/mirna.py | 3 ++- bigbio/biodatasets/mlee/mlee.py | 3 ++- bigbio/biodatasets/mqp/mqp.py 
| 3 ++- bigbio/biodatasets/msh_wsd/msh_wsd.py | 3 ++- bigbio/biodatasets/muchmore/muchmore.py | 3 ++- bigbio/biodatasets/multi_xscience/multi_xscience.py | 3 ++- .../biodatasets/mutation_finder/mutation_finder.py | 3 ++- bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py | 3 ++- .../n2c2_2006_smokers/n2c2_2006_smokers.py | 3 ++- bigbio/biodatasets/n2c2_2008/n2c2_2008.py | 3 ++- bigbio/biodatasets/n2c2_2009/n2c2_2009.py | 3 ++- bigbio/biodatasets/n2c2_2010/n2c2_2010.py | 3 ++- bigbio/biodatasets/n2c2_2011/n2c2_2011.py | 3 ++- bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py | 3 ++- .../n2c2_2014_risk_factors/n2c2_2014_risk_factors.py | 3 ++- .../biodatasets/n2c2_2018_track1/n2c2_2018_track1.py | 3 ++- .../biodatasets/n2c2_2018_track2/n2c2_2018_track2.py | 3 ++- bigbio/biodatasets/nagel/nagel.py | 3 ++- bigbio/biodatasets/ncbi_disease/ncbi_disease.py | 3 ++- bigbio/biodatasets/nlm_gene/nlm_gene.py | 3 ++- bigbio/biodatasets/nlm_wsd/nlm_wsd.py | 3 ++- bigbio/biodatasets/nlmchem/nlmchem.py | 3 ++- .../biodatasets/ntcir_13_medweb/ntcir_13_medweb.py | 3 ++- bigbio/biodatasets/osiris/osiris.py | 3 ++- bigbio/biodatasets/paramed/paramed.py | 3 ++- bigbio/biodatasets/pcr/pcr.py | 3 ++- bigbio/biodatasets/pdr/pdr.py | 3 ++- bigbio/biodatasets/pharmaconer/pharmaconer.py | 3 ++- bigbio/biodatasets/pho_ner/pho_ner.py | 3 ++- .../biodatasets/pico_extraction/pico_extraction.py | 3 ++- bigbio/biodatasets/pmc_patients/pmc_patients.py | 3 ++- bigbio/biodatasets/progene/progene.py | 3 ++- bigbio/biodatasets/psytar/psytar.py | 3 ++- bigbio/biodatasets/pubhealth/pubhealth.py | 3 ++- bigbio/biodatasets/pubmed_qa/pubmed_qa.py | 1 + .../biodatasets/pubtator_central/pubtator_central.py | 3 ++- bigbio/biodatasets/quaero/quaero.py | 3 ++- bigbio/biodatasets/scai_chemical/scai_chemical.py | 3 ++- bigbio/biodatasets/scai_disease/scai_disease.py | 3 ++- bigbio/biodatasets/scicite/scicite.py | 3 ++- bigbio/biodatasets/scielo/scielo.py | 3 ++- bigbio/biodatasets/scifact/scifact.py | 3 ++- 
bigbio/biodatasets/sciq/sciq.py | 3 ++- bigbio/biodatasets/scitail/scitail.py | 3 ++- bigbio/biodatasets/seth_corpus/seth_corpus.py | 3 ++- bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py | 3 ++- .../swedish_medical_ner/swedish_medical_ner.py | 3 ++- bigbio/biodatasets/thomas2011/thomas2011.py | 3 ++- bigbio/biodatasets/tmvar_v1/tmvar_v1.py | 3 ++- bigbio/biodatasets/tmvar_v2/tmvar_v2.py | 3 ++- bigbio/biodatasets/tmvar_v3/tmvar_v3.py | 3 ++- bigbio/biodatasets/twadrl/twadrl.py | 3 ++- bigbio/biodatasets/umnsrs/umnsrs.py | 3 ++- bigbio/biodatasets/verspoor_2013/verspoor_2013.py | 3 ++- 129 files changed, 276 insertions(+), 127 deletions(-) diff --git a/bigbio/biodatasets/an_em/an_em.py b/bigbio/biodatasets/an_em/an_em.py index f3460349a..4d956684a 100644 --- a/bigbio/biodatasets/an_em/an_em.py +++ b/bigbio/biodatasets/an_em/an_em.py @@ -29,9 +29,10 @@ import bigbio.utils.parsing as parse from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ANATOMY] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/anat_em/anat_em.py b/bigbio/biodatasets/anat_em/anat_em.py index c74125c24..c58f6fb19 100644 --- a/bigbio/biodatasets/anat_em/anat_em.py +++ b/bigbio/biodatasets/anat_em/anat_em.py @@ -27,9 +27,10 @@ import bigbio.utils.parsing as parsing from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.ANATOMY] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py index dee74515d..0b4eeffe4 100644 --- a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py +++ 
b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py @@ -21,11 +21,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "ask_a_patient" +_TAGS = [Tags.SOCIAL_MEDIA] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bc5cdr/bc5cdr.py b/bigbio/biodatasets/bc5cdr/bc5cdr.py index 47af693c0..5e729b270 100644 --- a/bigbio/biodatasets/bc5cdr/bc5cdr.py +++ b/bigbio/biodatasets/bc5cdr/bc5cdr.py @@ -31,10 +31,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.CHEMICAL_DISEASE_RELATION, Tags.MESH] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py b/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py index 2e9ca9e95..50543a186 100644 --- a/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py +++ b/bigbio/biodatasets/bc7_litcovid/bc7_litcovid.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.COVID] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py b/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py index 05db39fd8..afab00599 100644 --- a/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py +++ b/bigbio/biodatasets/bio_sim_verb/bio_sim_verb.py @@ -27,10 +27,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from 
bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.LEXICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bio_simlex/bio_simlex.py b/bigbio/biodatasets/bio_simlex/bio_simlex.py index 6b8fc6f8b..2a9ceceaa 100644 --- a/bigbio/biodatasets/bio_simlex/bio_simlex.py +++ b/bigbio/biodatasets/bio_simlex/bio_simlex.py @@ -27,10 +27,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.LEXICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py b/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py index 4672c3f53..680de353c 100644 --- a/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py +++ b/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py @@ -51,9 +51,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.DECS] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py b/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py index f5668647c..b17ed3828 100644 --- a/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py +++ b/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py @@ -32,9 +32,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] 
_PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py b/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py index 8012f3800..2aeeb7292 100644 --- a/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py +++ b/bigbio/biodatasets/bioasq_task_c_2017/bioasq_task_c_2017.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.GRANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/bioinfer/bioinfer.py b/bigbio/biodatasets/bioinfer/bioinfer.py index 8a71bbf59..dd1a7cfdd 100644 --- a/bigbio/biodatasets/bioinfer/bioinfer.py +++ b/bigbio/biodatasets/bioinfer/bioinfer.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.PPI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py b/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py index 751172900..41e8cca74 100644 --- a/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py +++ b/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.QA_HOW, Tags.QA_WHY] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/biomrc/biomrc.py b/bigbio/biodatasets/biomrc/biomrc.py index a80f0955c..df849298a 100644 --- 
a/bigbio/biodatasets/biomrc/biomrc.py +++ b/bigbio/biodatasets/biomrc/biomrc.py @@ -31,9 +31,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.QA_MULTIPLE_CHOICE, Tags.MRC, Tags.QA_CLOZE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py b/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py index 1f32a25d6..4e3303137 100644 --- a/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py +++ b/bigbio/biodatasets/bionlp_shared_task_2009/bionlp_shared_task_2009.py @@ -21,10 +21,15 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import brat_parse_to_bigbio_kb, parse_brat_file +# http://www.wikicfp.com/cfp/servlet/event.showcfp?eventid=4605&copyownerid=320 +# Task 1. Event detection and characterization +# Task 2. Event argument recognition +# Task 3. 
Recognition of negations and speculations +_TAGS = [Tags.PPI, Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py b/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py index 4c2d5991b..7a6ea0ab1 100644 --- a/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py +++ b/bigbio/biodatasets/bionlp_st_2011_epi/bionlp_st_2011_epi.py @@ -21,13 +21,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_epi" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.EPIGENETICS, Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py b/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py index 112c03a4a..3eab0c715 100644 --- a/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py +++ b/bigbio/biodatasets/bionlp_st_2011_ge/bionlp_st_2011_ge.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_ge" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py b/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py index 1d640ac3f..c5e0734d5 100644 --- a/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py +++ b/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py @@ -20,13 +20,21 @@ from 
bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_id" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [ + Tags.DISEASE, + Tags.GENE, + Tags.CHEMICAL, + Tags.ORGANISM, + Tags.SPECULATION, + Tags.NEGATION, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py b/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py index d6539fbb1..92a9c3b27 100644 --- a/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py +++ b/bigbio/biodatasets/bionlp_st_2011_rel/bionlp_st_2011_rel.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2011_rel" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.PART_OF, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py b/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py index f99326ec4..a72d0386d 100644 --- a/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py +++ b/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py @@ -20,12 +20,22 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_cg" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [ + Tags.DISEASE, + Tags.CANCER, + Tags.TISSUE, + Tags.ORGANISM, + Tags.CELL, + Tags.GENE, + Tags.CHEMICAL, + Tags.PATHWAY, +] _LANGUAGES = [Lang.EN] 
_PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py b/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py index 93dfa58f3..74a76bdea 100644 --- a/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py +++ b/bigbio/biodatasets/bionlp_st_2013_ge/bionlp_st_2013_ge.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_ge" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py b/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py index 277dfcec3..1241b22c5 100644 --- a/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py +++ b/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py @@ -21,13 +21,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_gro" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.GENE, Tags.ORGANISM, Tags.CELL, Tags.TISSUE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py b/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py index 69fd79f90..f685ff3ea 100644 --- a/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py +++ b/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py @@ -20,12 +20,13 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from 
bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2013_pc" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.GENE, Tags.CHEMICAL, Tags.PATHWAY, Tags.NEGATION, Tags.SPECULATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py b/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py index 026c83374..8d464b851 100644 --- a/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py +++ b/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py @@ -20,13 +20,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "bionlp_st_2019_bb" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [Tags.ORGANISM] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biored/biored.py b/bigbio/biodatasets/biored/biored.py index 250ce8374..b45bdacd1 100644 --- a/bigbio/biodatasets/biored/biored.py +++ b/bigbio/biodatasets/biored/biored.py @@ -26,10 +26,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CHEMICAL, Tags.VARIANT, Tags.PPI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biorelex/biorelex.py b/bigbio/biodatasets/biorelex/biorelex.py index f6dac279a..1b1d2a129 100644 --- a/bigbio/biodatasets/biorelex/biorelex.py +++ b/bigbio/biodatasets/biorelex/biorelex.py @@ -35,10 +35,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from 
bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [Tags.GENE, Tags.CHEMICAL, Tags.VARIANT, Tags.NEGATION, Tags.SPECULATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bioscope/bioscope.py b/bigbio/biodatasets/bioscope/bioscope.py index 5af2077a2..9e7d2e222 100644 --- a/bigbio/biodatasets/bioscope/bioscope.py +++ b/bigbio/biodatasets/bioscope/bioscope.py @@ -35,9 +35,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [Tags.NEGATION, Tags.SPECULATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/biosses/biosses.py b/bigbio/biodatasets/biosses/biosses.py index 059a03065..a55a313c2 100644 --- a/bigbio/biodatasets/biosses/biosses.py +++ b/bigbio/biodatasets/biosses/biosses.py @@ -28,11 +28,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "biosses" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cadec/cadec.py b/bigbio/biodatasets/cadec/cadec.py index 3eb3f6da3..13784fd6f 100644 --- a/bigbio/biodatasets/cadec/cadec.py +++ b/bigbio/biodatasets/cadec/cadec.py @@ -35,9 +35,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cantemist/cantemist.py b/bigbio/biodatasets/cantemist/cantemist.py index 
6a140d2a3..9d0c9d897 100644 --- a/bigbio/biodatasets/cantemist/cantemist.py +++ b/bigbio/biodatasets/cantemist/cantemist.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cas/cas.py b/bigbio/biodatasets/cas/cas.py index d563be29c..6c421ca86 100644 --- a/bigbio/biodatasets/cas/cas.py +++ b/bigbio/biodatasets/cas/cas.py @@ -6,9 +6,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/cellfinder/cellfinder.py b/bigbio/biodatasets/cellfinder/cellfinder.py index 935a919c0..9987ee5f6 100644 --- a/bigbio/biodatasets/cellfinder/cellfinder.py +++ b/bigbio/biodatasets/cellfinder/cellfinder.py @@ -28,9 +28,10 @@ import bigbio.utils.parsing as parsing from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chebi_nactem/chebi_nactem.py b/bigbio/biodatasets/chebi_nactem/chebi_nactem.py index c6e96c6f6..b7edd94f5 100644 --- a/bigbio/biodatasets/chebi_nactem/chebi_nactem.py +++ b/bigbio/biodatasets/chebi_nactem/chebi_nactem.py @@ -21,10 +21,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import 
Licenses from bigbio.utils.parsing import parse_brat_file +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chemdner/chemdner.py b/bigbio/biodatasets/chemdner/chemdner.py index 7b0b974f1..c1ec7c880 100644 --- a/bigbio/biodatasets/chemdner/chemdner.py +++ b/bigbio/biodatasets/chemdner/chemdner.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chemprot/chemprot.py b/bigbio/biodatasets/chemprot/chemprot.py index 620a1a449..c91d5aa81 100644 --- a/bigbio/biodatasets/chemprot/chemprot.py +++ b/bigbio/biodatasets/chemprot/chemprot.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chia/chia.py b/bigbio/biodatasets/chia/chia.py index 2328a4599..cc1b3d7ab 100644 --- a/bigbio/biodatasets/chia/chia.py +++ b/bigbio/biodatasets/chia/chia.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py b/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py index 0713a87f1..63efad002 100644 
--- a/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py +++ b/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py @@ -24,9 +24,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/codiesp/codiesp.py b/bigbio/biodatasets/codiesp/codiesp.py index 1cede6227..aea9c7860 100644 --- a/bigbio/biodatasets/codiesp/codiesp.py +++ b/bigbio/biodatasets/codiesp/codiesp.py @@ -35,9 +35,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cord_ner/cord_ner.py b/bigbio/biodatasets/cord_ner/cord_ner.py index 8724cf64f..38f956daa 100644 --- a/bigbio/biodatasets/cord_ner/cord_ner.py +++ b/bigbio/biodatasets/cord_ner/cord_ner.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ctebmsp/ctebmsp.py b/bigbio/biodatasets/ctebmsp/ctebmsp.py index 92ca3519d..42c23ef23 100644 --- a/bigbio/biodatasets/ctebmsp/ctebmsp.py +++ b/bigbio/biodatasets/ctebmsp/ctebmsp.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses 
+_TAGS = [] _LANGUAGES = [Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ddi_corpus/ddi_corpus.py b/bigbio/biodatasets/ddi_corpus/ddi_corpus.py index 4d8fb8937..970cdbb6b 100644 --- a/bigbio/biodatasets/ddi_corpus/ddi_corpus.py +++ b/bigbio/biodatasets/ddi_corpus/ddi_corpus.py @@ -27,9 +27,10 @@ import bigbio.utils.parsing as parsing from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py b/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py index a9f4a9279..9ae958463 100644 --- a/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py +++ b/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py @@ -27,9 +27,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/distemist/distemist.py b/bigbio/biodatasets/distemist/distemist.py index b9dfaf5d0..798c568a0 100644 --- a/bigbio/biodatasets/distemist/distemist.py +++ b/bigbio/biodatasets/distemist/distemist.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/ebm_pico/ebm_pico.py b/bigbio/biodatasets/ebm_pico/ebm_pico.py index 5e7078e06..f20a3379d 100644 --- a/bigbio/biodatasets/ebm_pico/ebm_pico.py +++ 
b/bigbio/biodatasets/ebm_pico/ebm_pico.py @@ -26,9 +26,10 @@ import datasets from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ehr_rel/ehr_rel.py b/bigbio/biodatasets/ehr_rel/ehr_rel.py index 90235ee4a..2ad2f965a 100644 --- a/bigbio/biodatasets/ehr_rel/ehr_rel.py +++ b/bigbio/biodatasets/ehr_rel/ehr_rel.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/essai/essai.py b/bigbio/biodatasets/essai/essai.py index 275aa115c..289055a63 100644 --- a/bigbio/biodatasets/essai/essai.py +++ b/bigbio/biodatasets/essai/essai.py @@ -6,9 +6,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/euadr/euadr.py b/bigbio/biodatasets/euadr/euadr.py index 35b276646..e68a1feb1 100644 --- a/bigbio/biodatasets/euadr/euadr.py +++ b/bigbio/biodatasets/euadr/euadr.py @@ -4,9 +4,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/evidence_inference/evidence_inference.py 
b/bigbio/biodatasets/evidence_inference/evidence_inference.py index 83fd2ca74..e21ce4f47 100644 --- a/bigbio/biodatasets/evidence_inference/evidence_inference.py +++ b/bigbio/biodatasets/evidence_inference/evidence_inference.py @@ -32,9 +32,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/gad/gad.py b/bigbio/biodatasets/gad/gad.py index 4a9286cea..d12e7b5b2 100644 --- a/bigbio/biodatasets/gad/gad.py +++ b/bigbio/biodatasets/gad/gad.py @@ -6,13 +6,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "gad" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = False _CITATION = """\ diff --git a/bigbio/biodatasets/genetag/genetag.py b/bigbio/biodatasets/genetag/genetag.py index e53b49185..bfe13bf53 100644 --- a/bigbio/biodatasets/genetag/genetag.py +++ b/bigbio/biodatasets/genetag/genetag.py @@ -29,9 +29,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py b/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py index 0e3f25369..ed8741663 100644 --- a/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py +++ b/bigbio/biodatasets/genia_ptm_event_corpus/genia_ptm_event_corpus.py @@ -29,9 +29,10 @@ from 
bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = False _CITATION = """\ diff --git a/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py b/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py index f0a730590..81c833687 100644 --- a/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py +++ b/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py b/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py index 0ae321ceb..7516e830d 100644 --- a/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py +++ b/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py b/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py index 1954035f8..5b37531a3 100644 --- a/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py +++ b/bigbio/biodatasets/geokhoj_v1/geokhoj_v1.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license 
import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/gnormplus/gnormplus.py b/bigbio/biodatasets/gnormplus/gnormplus.py index 7fd0e750c..28d16d360 100644 --- a/bigbio/biodatasets/gnormplus/gnormplus.py +++ b/bigbio/biodatasets/gnormplus/gnormplus.py @@ -23,10 +23,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py b/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py index 83d19030f..73439fe04 100644 --- a/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py +++ b/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py @@ -18,9 +18,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/hprd50/hprd50.py b/bigbio/biodatasets/hprd50/hprd50.py index 91b18470e..63de60017 100644 --- a/bigbio/biodatasets/hprd50/hprd50.py +++ b/bigbio/biodatasets/hprd50/hprd50.py @@ -38,10 +38,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses # TODO: Add BibTeX citation +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/iepa/iepa.py b/bigbio/biodatasets/iepa/iepa.py index 5efffd9f6..157893562 100644 --- a/bigbio/biodatasets/iepa/iepa.py +++ 
b/bigbio/biodatasets/iepa/iepa.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/jnlpba/jnlpba.py b/bigbio/biodatasets/jnlpba/jnlpba.py index d163c385e..a10a42981 100644 --- a/bigbio/biodatasets/jnlpba/jnlpba.py +++ b/bigbio/biodatasets/jnlpba/jnlpba.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/linnaeus/linnaeus.py b/bigbio/biodatasets/linnaeus/linnaeus.py index 14c1b6ef4..4a079d1d9 100644 --- a/bigbio/biodatasets/linnaeus/linnaeus.py +++ b/bigbio/biodatasets/linnaeus/linnaeus.py @@ -32,9 +32,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/lll/lll.py b/bigbio/biodatasets/lll/lll.py index 34259f128..ccc4eca80 100644 --- a/bigbio/biodatasets/lll/lll.py +++ b/bigbio/biodatasets/lll/lll.py @@ -39,6 +39,7 @@ from bigbio.utils.constants import BigBioValues, Lang, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py index e014f006d..0db20bd50 100644 --- a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py +++ b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py @@ -22,9 
+22,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.FR, Lang.DE, Lang.NL, Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mayosrs/mayosrs.py b/bigbio/biodatasets/mayosrs/mayosrs.py index 033a93b89..160a66668 100644 --- a/bigbio/biodatasets/mayosrs/mayosrs.py +++ b/bigbio/biodatasets/mayosrs/mayosrs.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/med_qa/med_qa.py b/bigbio/biodatasets/med_qa/med_qa.py index e83b70e90..4cdbc1d96 100644 --- a/bigbio/biodatasets/med_qa/med_qa.py +++ b/bigbio/biodatasets/med_qa/med_qa.py @@ -29,9 +29,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medal/medal.py b/bigbio/biodatasets/medal/medal.py index 2766f97b7..03df40fc3 100644 --- a/bigbio/biodatasets/medal/medal.py +++ b/bigbio/biodatasets/medal/medal.py @@ -26,11 +26,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses logger = datasets.logging.get_logger(__name__) +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/meddialog/meddialog.py 
b/bigbio/biodatasets/meddialog/meddialog.py index 90e77e55c..4d0e95b64 100644 --- a/bigbio/biodatasets/meddialog/meddialog.py +++ b/bigbio/biodatasets/meddialog/meddialog.py @@ -20,11 +20,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "meddialog" +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/meddocan/meddocan.py b/bigbio/biodatasets/meddocan/meddocan.py index a2e66d64b..e1fb393d1 100644 --- a/bigbio/biodatasets/meddocan/meddocan.py +++ b/bigbio/biodatasets/meddocan/meddocan.py @@ -29,9 +29,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medhop/medhop.py b/bigbio/biodatasets/medhop/medhop.py index 1b6012a7c..96c926399 100644 --- a/bigbio/biodatasets/medhop/medhop.py +++ b/bigbio/biodatasets/medhop/medhop.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/medical_data/medical_data.py b/bigbio/biodatasets/medical_data/medical_data.py index 986324525..80ddfdef7 100644 --- a/bigbio/biodatasets/medical_data/medical_data.py +++ b/bigbio/biodatasets/medical_data/medical_data.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants 
import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ diff --git a/bigbio/biodatasets/mediqa_nli/mediqa_nli.py b/bigbio/biodatasets/mediqa_nli/mediqa_nli.py index 3b82f39fa..153df024a 100644 --- a/bigbio/biodatasets/mediqa_nli/mediqa_nli.py +++ b/bigbio/biodatasets/mediqa_nli/mediqa_nli.py @@ -44,9 +44,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/mediqa_qa/mediqa_qa.py b/bigbio/biodatasets/mediqa_qa/mediqa_qa.py index 0e85d9268..1c26254e7 100644 --- a/bigbio/biodatasets/mediqa_qa/mediqa_qa.py +++ b/bigbio/biodatasets/mediqa_qa/mediqa_qa.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py b/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py index 9b9fe79ed..ad61f5313 100644 --- a/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py +++ b/bigbio/biodatasets/mediqa_rqe/mediqa_rqe.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medmentions/medmentions.py b/bigbio/biodatasets/medmentions/medmentions.py index a1e8e2d96..a1322f7e1 100644 --- 
a/bigbio/biodatasets/medmentions/medmentions.py +++ b/bigbio/biodatasets/medmentions/medmentions.py @@ -43,9 +43,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mednli/mednli.py b/bigbio/biodatasets/mednli/mednli.py index 5e6c8cace..4488852fd 100644 --- a/bigbio/biodatasets/mednli/mednli.py +++ b/bigbio/biodatasets/mednli/mednli.py @@ -42,9 +42,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/meqsum/meqsum.py b/bigbio/biodatasets/meqsum/meqsum.py index 684877dd6..21fe7f58c 100644 --- a/bigbio/biodatasets/meqsum/meqsum.py +++ b/bigbio/biodatasets/meqsum/meqsum.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/minimayosrs/minimayosrs.py b/bigbio/biodatasets/minimayosrs/minimayosrs.py index 1169fa673..f8f095bbe 100644 --- a/bigbio/biodatasets/minimayosrs/minimayosrs.py +++ b/bigbio/biodatasets/minimayosrs/minimayosrs.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git 
a/bigbio/biodatasets/mirna/mirna.py b/bigbio/biodatasets/mirna/mirna.py index 2b128f216..aa7e72793 100644 --- a/bigbio/biodatasets/mirna/mirna.py +++ b/bigbio/biodatasets/mirna/mirna.py @@ -19,9 +19,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mlee/mlee.py b/bigbio/biodatasets/mlee/mlee.py index d4e3db091..478079624 100644 --- a/bigbio/biodatasets/mlee/mlee.py +++ b/bigbio/biodatasets/mlee/mlee.py @@ -25,13 +25,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "mlee" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mqp/mqp.py b/bigbio/biodatasets/mqp/mqp.py index 6adf36a95..f1d47b1f3 100644 --- a/bigbio/biodatasets/mqp/mqp.py +++ b/bigbio/biodatasets/mqp/mqp.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/msh_wsd/msh_wsd.py b/bigbio/biodatasets/msh_wsd/msh_wsd.py index 59525ce36..2195106ac 100644 --- a/bigbio/biodatasets/msh_wsd/msh_wsd.py +++ b/bigbio/biodatasets/msh_wsd/msh_wsd.py @@ -40,9 +40,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from 
bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/muchmore/muchmore.py b/bigbio/biodatasets/muchmore/muchmore.py index f744477fb..3ae9d047d 100644 --- a/bigbio/biodatasets/muchmore/muchmore.py +++ b/bigbio/biodatasets/muchmore/muchmore.py @@ -73,9 +73,10 @@ # Buitelaar, Paul / Declerck, Thierry / Sacaleanu, Bogdan / Vintar, Spela / Raileanu, Diana / Crispi, Claudia: A Multi-Layered, XML-Based Approach to the Integration of Linguistic and Semantic Annotations. In: Proceedings of EACL 2003 Workshop on Language Technology and the Semantic Web (NLPXML’03), Budapest, Hungary, April 2003. from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.DE] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/multi_xscience/multi_xscience.py b/bigbio/biodatasets/multi_xscience/multi_xscience.py index ab8c55b6f..6be1347be 100644 --- a/bigbio/biodatasets/multi_xscience/multi_xscience.py +++ b/bigbio/biodatasets/multi_xscience/multi_xscience.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mutation_finder/mutation_finder.py b/bigbio/biodatasets/mutation_finder/mutation_finder.py index 277d5db4e..5dc113a7c 100644 --- a/bigbio/biodatasets/mutation_finder/mutation_finder.py +++ b/bigbio/biodatasets/mutation_finder/mutation_finder.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants 
import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py b/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py index f3cac12f6..9144f25f0 100644 --- a/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py +++ b/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py @@ -65,12 +65,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2006" # https://academic.oup.com/jamia/article/14/5/550/720189 +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py b/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py index 688400469..6e0fc9209 100644 --- a/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py +++ b/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py @@ -63,12 +63,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2006" # https://academic.oup.com/jamia/article/15/1/14/779738 +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2008/n2c2_2008.py b/bigbio/biodatasets/n2c2_2008/n2c2_2008.py index 0167def08..4b3054ac1 100644 --- a/bigbio/biodatasets/n2c2_2008/n2c2_2008.py +++ b/bigbio/biodatasets/n2c2_2008/n2c2_2008.py @@ -71,12 +71,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2008" # 
https://academic.oup.com/jamia/article/16/4/561/766997 +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2009/n2c2_2009.py b/bigbio/biodatasets/n2c2_2009/n2c2_2009.py index 3d9328a99..88f1e60c5 100644 --- a/bigbio/biodatasets/n2c2_2009/n2c2_2009.py +++ b/bigbio/biodatasets/n2c2_2009/n2c2_2009.py @@ -57,9 +57,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2010/n2c2_2010.py b/bigbio/biodatasets/n2c2_2010/n2c2_2010.py index 277081cf5..549ac121a 100644 --- a/bigbio/biodatasets/n2c2_2010/n2c2_2010.py +++ b/bigbio/biodatasets/n2c2_2010/n2c2_2010.py @@ -52,9 +52,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py index 44328533a..67fc5e684 100644 --- a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py +++ b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py @@ -72,12 +72,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py b/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py index 1e3992a19..75f972cb8 100644 --- 
a/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py +++ b/bigbio/biodatasets/n2c2_2014_deid/n2c2_2014_deid.py @@ -59,9 +59,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ diff --git a/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py b/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py index 524a48fca..fec27a82b 100644 --- a/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py +++ b/bigbio/biodatasets/n2c2_2014_risk_factors/n2c2_2014_risk_factors.py @@ -59,9 +59,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ diff --git a/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py b/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py index 27d0f5ae9..59411a293 100644 --- a/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py +++ b/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py @@ -43,9 +43,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py b/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py index ff26a9ebe..13ddc19b1 100644 --- a/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py +++ b/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py @@ -46,9 +46,10 @@ from bigbio.utils import schemas from 
bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/nagel/nagel.py b/bigbio/biodatasets/nagel/nagel.py index 260224c62..fd8a05f68 100644 --- a/bigbio/biodatasets/nagel/nagel.py +++ b/bigbio/biodatasets/nagel/nagel.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ncbi_disease/ncbi_disease.py b/bigbio/biodatasets/ncbi_disease/ncbi_disease.py index 4d85e9ac2..1efee20e5 100644 --- a/bigbio/biodatasets/ncbi_disease/ncbi_disease.py +++ b/bigbio/biodatasets/ncbi_disease/ncbi_disease.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nlm_gene/nlm_gene.py b/bigbio/biodatasets/nlm_gene/nlm_gene.py index 2d7e1a4bb..1a6c0e06f 100644 --- a/bigbio/biodatasets/nlm_gene/nlm_gene.py +++ b/bigbio/biodatasets/nlm_gene/nlm_gene.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nlm_wsd/nlm_wsd.py 
b/bigbio/biodatasets/nlm_wsd/nlm_wsd.py index 01620230d..7437d8df2 100644 --- a/bigbio/biodatasets/nlm_wsd/nlm_wsd.py +++ b/bigbio/biodatasets/nlm_wsd/nlm_wsd.py @@ -53,9 +53,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/nlmchem/nlmchem.py b/bigbio/biodatasets/nlmchem/nlmchem.py index ec83fe2ea..e816e3788 100644 --- a/bigbio/biodatasets/nlmchem/nlmchem.py +++ b/bigbio/biodatasets/nlmchem/nlmchem.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py index 7066df6e4..ff8734739 100644 --- a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py +++ b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py @@ -63,9 +63,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ZH, Lang.JA] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/osiris/osiris.py b/bigbio/biodatasets/osiris/osiris.py index 3929ca5d9..b83262563 100644 --- a/bigbio/biodatasets/osiris/osiris.py +++ b/bigbio/biodatasets/osiris/osiris.py @@ -24,9 +24,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import 
Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/paramed/paramed.py b/bigbio/biodatasets/paramed/paramed.py index 6791791e0..50966a93a 100644 --- a/bigbio/biodatasets/paramed/paramed.py +++ b/bigbio/biodatasets/paramed/paramed.py @@ -26,12 +26,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses logger = datasets.logging.get_logger(__name__) +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pcr/pcr.py b/bigbio/biodatasets/pcr/pcr.py index e2e10566a..28e3987e9 100644 --- a/bigbio/biodatasets/pcr/pcr.py +++ b/bigbio/biodatasets/pcr/pcr.py @@ -25,9 +25,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pdr/pdr.py b/bigbio/biodatasets/pdr/pdr.py index a41255e69..1c7bb9f77 100644 --- a/bigbio/biodatasets/pdr/pdr.py +++ b/bigbio/biodatasets/pdr/pdr.py @@ -28,9 +28,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pharmaconer/pharmaconer.py b/bigbio/biodatasets/pharmaconer/pharmaconer.py index 61a28ab9f..f20fd87f1 100644 --- 
a/bigbio/biodatasets/pharmaconer/pharmaconer.py +++ b/bigbio/biodatasets/pharmaconer/pharmaconer.py @@ -31,9 +31,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pho_ner/pho_ner.py b/bigbio/biodatasets/pho_ner/pho_ner.py index 28f8829a2..4ae3852d1 100644 --- a/bigbio/biodatasets/pho_ner/pho_ner.py +++ b/bigbio/biodatasets/pho_ner/pho_ner.py @@ -20,9 +20,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense +_TAGS = [] _LANGUAGES = [Lang.VI] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pico_extraction/pico_extraction.py b/bigbio/biodatasets/pico_extraction/pico_extraction.py index b05092615..ab4c36f25 100644 --- a/bigbio/biodatasets/pico_extraction/pico_extraction.py +++ b/bigbio/biodatasets/pico_extraction/pico_extraction.py @@ -27,9 +27,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pmc_patients/pmc_patients.py b/bigbio/biodatasets/pmc_patients/pmc_patients.py index b12a79ae2..05823f401 100644 --- a/bigbio/biodatasets/pmc_patients/pmc_patients.py +++ b/bigbio/biodatasets/pmc_patients/pmc_patients.py @@ -27,9 +27,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from 
bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/progene/progene.py b/bigbio/biodatasets/progene/progene.py index 49aec1dbd..f1ce6223a 100644 --- a/bigbio/biodatasets/progene/progene.py +++ b/bigbio/biodatasets/progene/progene.py @@ -22,9 +22,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/psytar/psytar.py b/bigbio/biodatasets/psytar/psytar.py index 61a16aa61..e0931739a 100644 --- a/bigbio/biodatasets/psytar/psytar.py +++ b/bigbio/biodatasets/psytar/psytar.py @@ -51,9 +51,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/pubhealth/pubhealth.py b/bigbio/biodatasets/pubhealth/pubhealth.py index 63c411bfb..5320c16e6 100644 --- a/bigbio/biodatasets/pubhealth/pubhealth.py +++ b/bigbio/biodatasets/pubhealth/pubhealth.py @@ -26,11 +26,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses logger = datasets.utils.logging.get_logger(__name__) +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pubmed_qa/pubmed_qa.py b/bigbio/biodatasets/pubmed_qa/pubmed_qa.py index c0e0228f2..4bdf15062 100644 --- a/bigbio/biodatasets/pubmed_qa/pubmed_qa.py +++ b/bigbio/biodatasets/pubmed_qa/pubmed_qa.py @@ -30,6 +30,7 @@ from 
bigbio.utils.constants import BigBioValues, Lang, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pubtator_central/pubtator_central.py b/bigbio/biodatasets/pubtator_central/pubtator_central.py index 972000e64..50048a96f 100644 --- a/bigbio/biodatasets/pubtator_central/pubtator_central.py +++ b/bigbio/biodatasets/pubtator_central/pubtator_central.py @@ -48,9 +48,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/quaero/quaero.py b/bigbio/biodatasets/quaero/quaero.py index 09a8e0598..29558a115 100644 --- a/bigbio/biodatasets/quaero/quaero.py +++ b/bigbio/biodatasets/quaero/quaero.py @@ -5,10 +5,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.FR] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scai_chemical/scai_chemical.py b/bigbio/biodatasets/scai_chemical/scai_chemical.py index 1abe0fb03..e3c4ef800 100644 --- a/bigbio/biodatasets/scai_chemical/scai_chemical.py +++ b/bigbio/biodatasets/scai_chemical/scai_chemical.py @@ -28,9 +28,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scai_disease/scai_disease.py 
b/bigbio/biodatasets/scai_disease/scai_disease.py index 711e54b19..4b7905d9b 100644 --- a/bigbio/biodatasets/scai_disease/scai_disease.py +++ b/bigbio/biodatasets/scai_disease/scai_disease.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scicite/scicite.py b/bigbio/biodatasets/scicite/scicite.py index 0fe741492..3a0f3284b 100644 --- a/bigbio/biodatasets/scicite/scicite.py +++ b/bigbio/biodatasets/scicite/scicite.py @@ -37,9 +37,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scielo/scielo.py b/bigbio/biodatasets/scielo/scielo.py index 73aea9985..44659df74 100644 --- a/bigbio/biodatasets/scielo/scielo.py +++ b/bigbio/biodatasets/scielo/scielo.py @@ -21,9 +21,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ES, Lang.PT] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scifact/scifact.py b/bigbio/biodatasets/scifact/scifact.py index 22065ec20..14a48e21a 100644 --- a/bigbio/biodatasets/scifact/scifact.py +++ b/bigbio/biodatasets/scifact/scifact.py @@ -22,9 +22,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import 
Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/sciq/sciq.py b/bigbio/biodatasets/sciq/sciq.py index 2f23906d7..54f44e622 100644 --- a/bigbio/biodatasets/sciq/sciq.py +++ b/bigbio/biodatasets/sciq/sciq.py @@ -20,11 +20,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "sciq" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scitail/scitail.py b/bigbio/biodatasets/scitail/scitail.py index b945bdaa9..c5dcdca57 100644 --- a/bigbio/biodatasets/scitail/scitail.py +++ b/bigbio/biodatasets/scitail/scitail.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/seth_corpus/seth_corpus.py b/bigbio/biodatasets/seth_corpus/seth_corpus.py index 70d4c6d48..82b84b534 100644 --- a/bigbio/biodatasets/seth_corpus/seth_corpus.py +++ b/bigbio/biodatasets/seth_corpus/seth_corpus.py @@ -28,9 +28,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py b/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py index 1cf5812ab..a6b16123f 100644 --- a/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py +++ b/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py @@ -64,9 +64,10 @@ from bigbio.utils import schemas from 
bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py b/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py index 9a1ff0769..1ec26aca1 100644 --- a/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py +++ b/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py @@ -38,11 +38,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "swedish_medical_ner" +_TAGS = [] _LANGUAGES = [Lang.SV] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/thomas2011/thomas2011.py b/bigbio/biodatasets/thomas2011/thomas2011.py index 6e7c24842..d2747c4a8 100644 --- a/bigbio/biodatasets/thomas2011/thomas2011.py +++ b/bigbio/biodatasets/thomas2011/thomas2011.py @@ -49,10 +49,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense # TODO: Add BibTeX citation +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v1/tmvar_v1.py b/bigbio/biodatasets/tmvar_v1/tmvar_v1.py index e2d59b74b..f6cb22e59 100644 --- a/bigbio/biodatasets/tmvar_v1/tmvar_v1.py +++ b/bigbio/biodatasets/tmvar_v1/tmvar_v1.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True 
_LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v2/tmvar_v2.py b/bigbio/biodatasets/tmvar_v2/tmvar_v2.py index 8e766d028..b522524b6 100644 --- a/bigbio/biodatasets/tmvar_v2/tmvar_v2.py +++ b/bigbio/biodatasets/tmvar_v2/tmvar_v2.py @@ -23,9 +23,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v3/tmvar_v3.py b/bigbio/biodatasets/tmvar_v3/tmvar_v3.py index c5b7d93dc..1e2bb9dd4 100644 --- a/bigbio/biodatasets/tmvar_v3/tmvar_v3.py +++ b/bigbio/biodatasets/tmvar_v3/tmvar_v3.py @@ -22,7 +22,7 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _CITATION = """\ @@ -44,6 +44,7 @@ copyright = {Creative Commons Attribution 4.0 International} } """ +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/twadrl/twadrl.py b/bigbio/biodatasets/twadrl/twadrl.py index d73089124..abfb83a91 100644 --- a/bigbio/biodatasets/twadrl/twadrl.py +++ b/bigbio/biodatasets/twadrl/twadrl.py @@ -21,11 +21,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses _DATASETNAME = "twadrl" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/umnsrs/umnsrs.py b/bigbio/biodatasets/umnsrs/umnsrs.py index 6ec1416a2..8bb39e554 100644 --- a/bigbio/biodatasets/umnsrs/umnsrs.py +++ b/bigbio/biodatasets/umnsrs/umnsrs.py @@ -29,9 +29,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import 
BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/verspoor_2013/verspoor_2013.py b/bigbio/biodatasets/verspoor_2013/verspoor_2013.py index be5f625e6..58f613132 100644 --- a/bigbio/biodatasets/verspoor_2013/verspoor_2013.py +++ b/bigbio/biodatasets/verspoor_2013/verspoor_2013.py @@ -32,9 +32,10 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False From 4f328b4ff898e06d273b9668e906e195f75f35c1 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Tue, 7 Jun 2022 18:43:25 +0200 Subject: [PATCH 02/20] add _TAGS --- examples/bc5cdr.py | 3 ++- examples/bioasq_task_b.py | 8 +++++++- examples/biosses.py | 3 ++- examples/chemprot.py | 3 ++- examples/hallmarks_of_cancer.py | 3 ++- examples/mlee.py | 3 ++- examples/mqp.py | 3 ++- examples/muchmore.py | 3 ++- examples/n2c2_2011.py | 3 ++- examples/nlmchem.py | 3 ++- examples/paramed.py | 7 ++++--- examples/scitail.py | 3 ++- 12 files changed, 31 insertions(+), 14 deletions(-) diff --git a/examples/bc5cdr.py b/examples/bc5cdr.py index 111d7beea..ee325c162 100644 --- a/examples/bc5cdr.py +++ b/examples/bc5cdr.py @@ -31,10 +31,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.CHEMICAL_DISEASE_RELATION, Tags.MESH] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/bioasq_task_b.py 
b/examples/bioasq_task_b.py index da38146a9..9026918f6 100644 --- a/examples/bioasq_task_b.py +++ b/examples/bioasq_task_b.py @@ -32,9 +32,15 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [ + Tags.QA_YESNO + Tags.QA_FACTOID, + Tags.QA_LIST, + Tags.QA_SUMMARY, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/examples/biosses.py b/examples/biosses.py index 059a03065..80aa75b36 100644 --- a/examples/biosses.py +++ b/examples/biosses.py @@ -28,11 +28,12 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses _DATASETNAME = "biosses" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/examples/chemprot.py b/examples/chemprot.py index 1db648c73..c29b362ae 100644 --- a/examples/chemprot.py +++ b/examples/chemprot.py @@ -25,9 +25,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/hallmarks_of_cancer.py b/examples/hallmarks_of_cancer.py index ae8673b40..50600def9 100644 --- a/examples/hallmarks_of_cancer.py +++ b/examples/hallmarks_of_cancer.py @@ -19,9 +19,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/mlee.py b/examples/mlee.py index 
2f6b09ddd..e0330d53a 100644 --- a/examples/mlee.py +++ b/examples/mlee.py @@ -25,13 +25,14 @@ from bigbio.utils import parsing, schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses _DATASETNAME = "mlee" _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/mqp.py b/examples/mqp.py index b42cbd539..c9e122bc9 100644 --- a/examples/mqp.py +++ b/examples/mqp.py @@ -26,9 +26,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/examples/muchmore.py b/examples/muchmore.py index da6bc7430..9afb2982d 100644 --- a/examples/muchmore.py +++ b/examples/muchmore.py @@ -73,9 +73,10 @@ # Buitelaar, Paul / Declerck, Thierry / Sacaleanu, Bogdan / Vintar, Spela / Raileanu, Diana / Crispi, Claudia: A Multi-Layered, XML-Based Approach to the Integration of Linguistic and Semantic Annotations. In: Proceedings of EACL 2003 Workshop on Language Technology and the Semantic Web (NLPXML’03), Budapest, Hungary, April 2003. 
from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/n2c2_2011.py b/examples/n2c2_2011.py index 44328533a..d1dd79f7f 100644 --- a/examples/n2c2_2011.py +++ b/examples/n2c2_2011.py @@ -72,12 +72,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/examples/nlmchem.py b/examples/nlmchem.py index 945461bf0..885234462 100644 --- a/examples/nlmchem.py +++ b/examples/nlmchem.py @@ -22,10 +22,11 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/paramed.py b/examples/paramed.py index 6791791e0..518d7e623 100644 --- a/examples/paramed.py +++ b/examples/paramed.py @@ -1,7 +1,7 @@ -# coding=utf-8 +# bcoding=utf-8 # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. # -# Licensed under the Apache License, Version 2.0 (the "License"); +# bicensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # @@ -26,12 +26,13 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses logger = datasets.logging.get_logger(__name__) +_TAGS = [] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/examples/scitail.py b/examples/scitail.py index d7bf14dd9..1be23c7cc 100644 --- a/examples/scitail.py +++ b/examples/scitail.py @@ -30,9 +30,10 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tasks +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses +_TAGS = [] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False From 515d9acc3278969adc1a5df4f06d1bf79cfed9be Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Tue, 7 Jun 2022 18:43:36 +0200 Subject: [PATCH 03/20] create Tags Enum --- bigbio/utils/constants.py | 16 ++++++++++++--- bigbio/utils/resources/tags.json | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 bigbio/utils/resources/tags.json diff --git a/bigbio/utils/constants.py b/bigbio/utils/constants.py index 78a574167..8405bae3b 100644 --- a/bigbio/utils/constants.py +++ b/bigbio/utils/constants.py @@ -6,9 +6,19 @@ from bigbio.utils import resources from bigbio.utils.license import Licenses -from bigbio.utils.schemas import (entailment_features, kb_features, - pairs_features, qa_features, - text2text_features, text_features) +from bigbio.utils.schemas import ( + entailment_features, + kb_features, + pairs_features, + qa_features, + text2text_features, + text_features, +) + + +_TAGS = json.loads(pkg_resources.read_text(resources, "tags.json")) +Tags = Enum("Tags", _TAGS) + BigBioValues = SimpleNamespace(NULL="") diff --git a/bigbio/utils/resources/tags.json 
b/bigbio/utils/resources/tags.json new file mode 100644 index 000000000..f2daadbbb --- /dev/null +++ b/bigbio/utils/resources/tags.json @@ -0,0 +1,34 @@ +{ + "SOCIAL_MEDIA" : "Social media", + "ANATOMY" : "Anatomy", + "ORGANISM" : "Organism", + "VARIANT" : "Variant/Mutation", + "TISSUE" : "Tissue", + "CELL" : "Cells and/or cell lines", + "SPECIES" : "Species", + "GENE" : "Gene, proteins, gene products, ...", + "DISEASE" : "Disease", + "CHEMICAL" : "Chemical", + "UMLS" : "Unified Medical Language System", + "COVID" : "Coronavirus disease 2019 (COVID-19)", + "LEXICAL" : "Lexical data (e.g. word, verbs,...)", + "DECS" : "Descriptores en Ciencias de la Salud", + "QA_YESNO" : "QA with yes no answer", + "QA_FACTOID" : "QA with factoid answer", + "QA_LIST": "QA with list of factoid answer", + "QA_SUMMARY_ANSWER" : "QA with abstractive summary answer", + "QA_HOW" : "`How` question", + "QA_WHY" : "`Why` question", + "GRANT" : "Grants data", + "PPI" : "Protein-protein interaction", + "QA_CLOZE" : "Cloze test", + "MRC" : "Machine Reading Comprehension", + "QA_MULTIPLE_CHOICE" : "QA with multiple choice", + "NEGATION" : "Negation", + "SPECULATION" : "Speculation", + "EPIGENETICS" : "Epigenetics", + "PART_OF" : "Part-of relations", + "CANCER" : "Cancer", + "PATHWAY" : "Pathway", + "MESH" : "Medical Subject Headings (MeSH)" +} From beb1eb6f51cd914f235f84429d9d8f47a10bf515 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 12:20:44 +0200 Subject: [PATCH 04/20] update _TAGS --- .../ask_a_patient/ask_a_patient.py | 2 +- bigbio/biodatasets/bc5cdr/bc5cdr.py | 2 +- .../bioasq_2021_mesinesp.py | 2 +- .../bioasq_task_b/bioasq_task_b.py | 8 +- .../biology_how_why_corpus.py | 2 +- bigbio/biodatasets/biomrc/biomrc.py | 2 +- bigbio/biodatasets/cadec/cadec.py | 2 +- bigbio/biodatasets/cantemist/cantemist.py | 2 +- bigbio/biodatasets/cas/cas.py | 2 +- bigbio/biodatasets/cellfinder/cellfinder.py | 2 +- .../biodatasets/chebi_nactem/chebi_nactem.py | 2 +- 
bigbio/biodatasets/chemdner/chemdner.py | 2 +- bigbio/biodatasets/chemprot/chemprot.py | 2 +- bigbio/biodatasets/chia/chia.py | 2 +- .../citation_gia_test_collection.py | 137 ++++++++++-------- bigbio/biodatasets/codiesp/codiesp.py | 2 +- bigbio/biodatasets/cord_ner/cord_ner.py | 2 +- bigbio/biodatasets/ctebmsp/ctebmsp.py | 2 +- bigbio/biodatasets/ddi_corpus/ddi_corpus.py | 2 +- .../diann_iber_eval/diann_iber_eval.py | 2 +- bigbio/biodatasets/distemist/distemist.py | 2 +- bigbio/biodatasets/ebm_pico/ebm_pico.py | 30 +++- bigbio/biodatasets/ehr_rel/ehr_rel.py | 2 +- bigbio/biodatasets/essai/essai.py | 2 +- bigbio/biodatasets/euadr/euadr.py | 10 +- .../evidence_inference/evidence_inference.py | 2 +- bigbio/biodatasets/genetag/genetag.py | 2 +- .../genia_relation_corpus.py | 2 +- .../genia_term_corpus/genia_term_corpus.py | 2 +- bigbio/biodatasets/gnormplus/gnormplus.py | 2 +- .../hallmarks_of_cancer.py | 41 +++--- bigbio/biodatasets/hprd50/hprd50.py | 2 +- bigbio/biodatasets/iepa/iepa.py | 2 +- bigbio/biodatasets/jnlpba/jnlpba.py | 2 +- bigbio/biodatasets/linnaeus/linnaeus.py | 2 +- bigbio/biodatasets/lll/lll.py | 2 +- bigbio/biodatasets/mantra_gsc/mantra_gsc.py | 2 +- bigbio/biodatasets/mayosrs/mayosrs.py | 2 +- bigbio/biodatasets/med_qa/med_qa.py | 2 +- bigbio/biodatasets/meddialog/meddialog.py | 2 +- bigbio/biodatasets/meddocan/meddocan.py | 2 +- bigbio/biodatasets/medhop/medhop.py | 2 +- .../biodatasets/medical_data/medical_data.py | 2 +- bigbio/biodatasets/mediqa_qa/mediqa_qa.py | 2 +- bigbio/biodatasets/medmentions/medmentions.py | 2 +- bigbio/utils/resources/tags.json | 34 +++-- 46 files changed, 203 insertions(+), 137 deletions(-) diff --git a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py index 0b4eeffe4..bd89c5026 100644 --- a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py +++ b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py @@ -26,7 +26,7 @@ _DATASETNAME = "ask_a_patient" -_TAGS = 
[Tags.SOCIAL_MEDIA] +_TAGS = [Tags.SOCIAL_MEDIA, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bc5cdr/bc5cdr.py b/bigbio/biodatasets/bc5cdr/bc5cdr.py index 5e729b270..45ed49a77 100644 --- a/bigbio/biodatasets/bc5cdr/bc5cdr.py +++ b/bigbio/biodatasets/bc5cdr/bc5cdr.py @@ -35,7 +35,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.CHEMICAL_DISEASE_RELATION, Tags.MESH] +_TAGS = [Tags.DISEASE, Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py b/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py index 680de353c..7fd13d83f 100644 --- a/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py +++ b/bigbio/biodatasets/bioasq_2021_mesinesp/bioasq_2021_mesinesp.py @@ -54,7 +54,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.DECS] +_TAGS = [Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py b/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py index b17ed3828..685ac4e45 100644 --- a/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py +++ b/bigbio/biodatasets/bioasq_task_b/bioasq_task_b.py @@ -35,7 +35,13 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [ + Tags.YESNO, + Tags.FACTOID, + Tags.FACTOID_LIST, + Tags.ABSTRACTIVE, + Tags.EXTRACTIVE, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py b/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py index 41e8cca74..282050c63 100644 --- a/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py +++ 
b/bigbio/biodatasets/biology_how_why_corpus/biology_how_why_corpus.py @@ -33,7 +33,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.QA_HOW, Tags.QA_WHY] +_TAGS = [Tags.HOW, Tags.WHY] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/biomrc/biomrc.py b/bigbio/biodatasets/biomrc/biomrc.py index df849298a..43dd1f725 100644 --- a/bigbio/biodatasets/biomrc/biomrc.py +++ b/bigbio/biodatasets/biomrc/biomrc.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.QA_MULTIPLE_CHOICE, Tags.MRC, Tags.QA_CLOZE] +_TAGS = [Tags.MULTIPLE_CHOICE, Tags.MRC, Tags.CLOZE_TEST] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/cadec/cadec.py b/bigbio/biodatasets/cadec/cadec.py index 13784fd6f..c604c0920 100644 --- a/bigbio/biodatasets/cadec/cadec.py +++ b/bigbio/biodatasets/cadec/cadec.py @@ -38,7 +38,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [] +_TAGS = [Tags.SOCIAL_MEDIA, Tags.DISEASE, Tags.ADR, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cantemist/cantemist.py b/bigbio/biodatasets/cantemist/cantemist.py index 9d0c9d897..9b4af0460 100644 --- a/bigbio/biodatasets/cantemist/cantemist.py +++ b/bigbio/biodatasets/cantemist/cantemist.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CANCER, Tags.DISEASE, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cas/cas.py b/bigbio/biodatasets/cas/cas.py index 6c421ca86..6b45d7d2b 100644 --- a/bigbio/biodatasets/cas/cas.py +++ b/bigbio/biodatasets/cas/cas.py @@ -9,7 +9,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses 
-_TAGS = [] +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.POS] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/cellfinder/cellfinder.py b/bigbio/biodatasets/cellfinder/cellfinder.py index 9987ee5f6..04b36b529 100644 --- a/bigbio/biodatasets/cellfinder/cellfinder.py +++ b/bigbio/biodatasets/cellfinder/cellfinder.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.CELL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chebi_nactem/chebi_nactem.py b/bigbio/biodatasets/chebi_nactem/chebi_nactem.py index b7edd94f5..aeb5f48bf 100644 --- a/bigbio/biodatasets/chebi_nactem/chebi_nactem.py +++ b/bigbio/biodatasets/chebi_nactem/chebi_nactem.py @@ -25,7 +25,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import parse_brat_file -_TAGS = [] +_TAGS = [Tags.GENE, Tags.CHEMICAL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chemdner/chemdner.py b/bigbio/biodatasets/chemdner/chemdner.py index c1ec7c880..4e237b6b9 100644 --- a/bigbio/biodatasets/chemdner/chemdner.py +++ b/bigbio/biodatasets/chemdner/chemdner.py @@ -26,7 +26,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chemprot/chemprot.py b/bigbio/biodatasets/chemprot/chemprot.py index c91d5aa81..fc2aa6793 100644 --- a/bigbio/biodatasets/chemprot/chemprot.py +++ b/bigbio/biodatasets/chemprot/chemprot.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/chia/chia.py 
b/bigbio/biodatasets/chia/chia.py index cc1b3d7ab..da93b98d4 100644 --- a/bigbio/biodatasets/chia/chia.py +++ b/bigbio/biodatasets/chia/chia.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.DRUG, Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py b/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py index 63efad002..28169f96f 100644 --- a/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py +++ b/bigbio/biodatasets/citation_gia_test_collection/citation_gia_test_collection.py @@ -27,7 +27,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -60,11 +60,11 @@ _URLS = { _DATASETNAME: [ - "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/download/GNormPlus/GNormPlusCorpus.zip"] + "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/download/GNormPlus/GNormPlusCorpus.zip" + ] } -_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, - Tasks.NAMED_ENTITY_DISAMBIGUATION] +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION] _SOURCE_VERSION = "1.0.0" @@ -73,8 +73,8 @@ class CitationGIATestCollection(datasets.GeneratorBasedBuilder): """ - The Citation GIA Test Collection was recently created for gene indexing at the NLM and includes - 151 PubMed abstracts with both mention-level and document-level annotations. + The Citation GIA Test Collection was recently created for gene indexing at the NLM and includes + 151 PubMed abstracts with both mention-level and document-level annotations. They are selected because both have a focus on human genes. 
""" @@ -95,7 +95,7 @@ class CitationGIATestCollection(datasets.GeneratorBasedBuilder): description="citation_gia_test_collection BigBio schema", schema="bigbio_kb", subset_id="citation_gia_test_collection", - ) + ), ] DEFAULT_CONFIG_NAME = "citation_gia_test_collection_source" @@ -127,7 +127,7 @@ def _info(self) -> datasets.DatasetInfo: } ], } - ] + ], } ) @@ -151,16 +151,18 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ - "filepath": os.path.join(data_dir[0], "GNormPlusCorpus/NLMIAT.BioC.xml"), + "filepath": os.path.join( + data_dir[0], "GNormPlusCorpus/NLMIAT.BioC.xml" + ), "split": "NLMIAT", }, ), ] def _get_entities(self, annot_d: dict) -> dict: - '''' + """' Converts annotation dict to entity dict. - ''' + """ ent = { "id": str(uuid.uuid4()), "type": annot_d["type"], @@ -176,13 +178,15 @@ def _get_entities(self, annot_d: dict) -> dict: return ent - def _get_offsets_entities(child, parent_text: str, child_text: str, offset: int) -> List[int]: - ''' - Extracts child text offsets from parent text for entities. + def _get_offsets_entities( + child, parent_text: str, child_text: str, offset: int + ) -> List[int]: + """ + Extracts child text offsets from parent text for entities. Some offsets that were present in the datset were wrong mainly because of string encodings. - Also a little fraction of parent strings doesn't contain its respective child strings. - Hence few assertion errors in the entitity offsets checking test. - ''' + Also a little fraction of parent strings doesn't contain its respective child strings. + Hence few assertion errors in the entitity offsets checking test. 
+ """ if child_text in parent_text: index = parent_text.index(child_text) start = index + offset @@ -194,10 +198,10 @@ def _get_offsets_entities(child, parent_text: str, child_text: str, offset: int) return [start, end] def _process_annot(self, annot: ET.Element, passages: dict) -> dict: - '''' + """' Converts annotation XML Element to Python dict. - ''' - parent_text = " ".join([p['text'] for p in passages.values()]) + """ + parent_text = " ".join([p["text"] for p in passages.values()]) annot_d = dict() a_d = {a.tag: a.text for a in annot} @@ -206,21 +210,21 @@ def _process_annot(self, annot: ET.Element, passages: dict) -> dict: if a.tag == "location": offset = int(a.attrib["offset"]) annot_d["offsets"] = self._get_offsets_entities( - html.escape(parent_text[offset:]), - html.escape(a_d["text"]), offset) + html.escape(parent_text[offset:]), html.escape(a_d["text"]), offset + ) elif a.tag != "infon": annot_d[a.tag] = html.escape(a.text) else: annot_d[a.attrib["key"]] = html.escape(a.text) - + return annot_d def _parse_elem(self, elem: ET.Element) -> dict: - '''' + """' Converts document XML Element to Python dict. 
- ''' + """ elem_d = dict() passages = dict() annotations = elem.findall(".//annotation") @@ -231,8 +235,21 @@ def _parse_elem(self, elem: ET.Element) -> dict: for child in elem: if child.tag == "passage": - elem_d[child.tag].append({c.tag: html.escape(" ".join(list(filter( - lambda item: item, [t.strip('\n') for t in c.itertext()])))) for c in child}) + elem_d[child.tag].append( + { + c.tag: html.escape( + " ".join( + list( + filter( + lambda item: item, + [t.strip("\n") for t in c.itertext()], + ) + ) + ) + ) + for c in child + } + ) elif child.tag == "id": elem_d[child.tag] = html.escape(child.text) @@ -243,11 +260,10 @@ def _parse_elem(self, elem: ET.Element) -> dict: passages[infon] = passage elem_d["passages"] = passages - elem_d.pop('passage', None) + elem_d.pop("passage", None) for a in annotations: - elem_d["entities"].append( - self._process_annot(a, elem_d["passages"])) + elem_d["entities"].append(self._process_annot(a, elem_d["passages"])) return elem_d @@ -261,31 +277,35 @@ def _generate_examples(self, filepath, split): row = self._parse_elem(elem) uid += 1 passages = row["passages"] - yield uid, { + yield uid, { "id": str(uid), "passages": [ { "id": str(uuid.uuid4()), "type": "title", "text": [passages["title"]["text"]], - "offsets": [[ - int(passages["title"]["offset"]), - int(passages["title"]["offset"]) + - len(passages["title"]["text"]) - ]], + "offsets": [ + [ + int(passages["title"]["offset"]), + int(passages["title"]["offset"]) + + len(passages["title"]["text"]), + ] + ], }, { "id": str(uuid.uuid4()), "type": "abstract", "text": [passages["abstract"]["text"]], - "offsets": [[ - int(passages["abstract"]["offset"]), - int(passages["abstract"]["offset"]) + - len(passages["abstract"]["text"]) - ]], - } + "offsets": [ + [ + int(passages["abstract"]["offset"]), + int(passages["abstract"]["offset"]) + + len(passages["abstract"]["text"]), + ] + ], + }, ], - "entities": [self._get_entities(a) for a in row["entities"]] + "entities": 
[self._get_entities(a) for a in row["entities"]], } elif self.config.schema == "bigbio_kb": @@ -294,7 +314,7 @@ def _generate_examples(self, filepath, split): row = self._parse_elem(elem) uid += 1 passages = row["passages"] - yield uid, { + yield uid, { "id": str(uid), "document_id": str(uuid.uuid4()), "passages": [ @@ -302,26 +322,29 @@ def _generate_examples(self, filepath, split): "id": str(uuid.uuid4()), "type": "title", "text": [passages["title"]["text"]], - "offsets": [[ - int(passages["title"]["offset"]), - int(passages["title"]["offset"]) + - len(passages["title"] - ["text"]) - ]], + "offsets": [ + [ + int(passages["title"]["offset"]), + int(passages["title"]["offset"]) + + len(passages["title"]["text"]), + ] + ], }, { "id": str(uuid.uuid4()), "type": "abstract", "text": [passages["abstract"]["text"]], - "offsets": [[ - int(passages["abstract"]["offset"]), - int(passages["abstract"]["offset"]) + - len(passages["abstract"]["text"]) - ]], - } + "offsets": [ + [ + int(passages["abstract"]["offset"]), + int(passages["abstract"]["offset"]) + + len(passages["abstract"]["text"]), + ] + ], + }, ], "entities": [self._get_entities(a) for a in row["entities"]], "relations": [], "events": [], - "coreferences": [] + "coreferences": [], } diff --git a/bigbio/biodatasets/codiesp/codiesp.py b/bigbio/biodatasets/codiesp/codiesp.py index aea9c7860..65671fcd8 100644 --- a/bigbio/biodatasets/codiesp/codiesp.py +++ b/bigbio/biodatasets/codiesp/codiesp.py @@ -38,7 +38,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.DOCUMENT_INDEXING, Tags.PROCEDURE] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cord_ner/cord_ner.py b/bigbio/biodatasets/cord_ner/cord_ner.py index 38f956daa..5457155d7 100644 --- a/bigbio/biodatasets/cord_ner/cord_ner.py +++ b/bigbio/biodatasets/cord_ner/cord_ner.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, 
Tasks from bigbio.utils.license import CustomLicense -_TAGS = [] +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CHEMICAL, Tags.COVID, Tags.ORGANISM] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ctebmsp/ctebmsp.py b/bigbio/biodatasets/ctebmsp/ctebmsp.py index 42c23ef23..f5a3fc2b8 100644 --- a/bigbio/biodatasets/ctebmsp/ctebmsp.py +++ b/bigbio/biodatasets/ctebmsp/ctebmsp.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ANATOMY, Tags.CHEMICAL, Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ddi_corpus/ddi_corpus.py b/bigbio/biodatasets/ddi_corpus/ddi_corpus.py index 970cdbb6b..7ff25476b 100644 --- a/bigbio/biodatasets/ddi_corpus/ddi_corpus.py +++ b/bigbio/biodatasets/ddi_corpus/ddi_corpus.py @@ -30,7 +30,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DDI, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py b/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py index 9ae958463..8dcc4ac1c 100644 --- a/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py +++ b/bigbio/biodatasets/diann_iber_eval/diann_iber_eval.py @@ -30,7 +30,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DOCUMENT_INDEXING, Tags.DISEASE] _LANGUAGES = [Lang.EN, Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/distemist/distemist.py b/bigbio/biodatasets/distemist/distemist.py index 798c568a0..1471653f4 100644 --- a/bigbio/biodatasets/distemist/distemist.py +++ b/bigbio/biodatasets/distemist/distemist.py @@ -24,7 +24,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE] 
_LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/ebm_pico/ebm_pico.py b/bigbio/biodatasets/ebm_pico/ebm_pico.py index f20a3379d..0abb19048 100644 --- a/bigbio/biodatasets/ebm_pico/ebm_pico.py +++ b/bigbio/biodatasets/ebm_pico/ebm_pico.py @@ -29,7 +29,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.PICO, Tags.POS] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -66,7 +66,9 @@ _LICENSE = Licenses.UNKNOWN -_URLS = {_DATASETNAME: "https://github.com/bepnye/EBM-NLP/raw/master/ebm_nlp_2_00.tar.gz"} +_URLS = { + _DATASETNAME: "https://github.com/bepnye/EBM-NLP/raw/master/ebm_nlp_2_00.tar.gz" +} _SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] @@ -139,7 +141,9 @@ def _partition(alist, indices): for _indices in multiple_indices: high_level_type = LABEL_DECODERS["starting_spans"][annotation_type][1] - fine_grained_type = LABEL_DECODERS["hierarchical_labels"][annotation_type][annotations[_indices[0]]] + fine_grained_type = LABEL_DECODERS["hierarchical_labels"][ + annotation_type + ][annotations[_indices[0]]] annotation_text = " ".join([tokenized[ind] for ind in _indices]) char_start = document_content.find(annotation_text) @@ -222,7 +226,9 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: data_dir = dl_manager.download_and_extract(urls) documents_folder = Path(data_dir) / "ebm_nlp_2_00" / "documents" - annotations_folder = Path(data_dir) / "ebm_nlp_2_00" / "annotations" / "aggregated" + annotations_folder = ( + Path(data_dir) / "ebm_nlp_2_00" / "annotations" / "aggregated" + ) return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, @@ -242,7 +248,9 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: ), ] - def _generate_examples(self, documents_folder, annotations_folder, split_folder: str) -> Tuple[int, Dict]: + def _generate_examples( + self, documents_folder, annotations_folder, split_folder: 
str + ) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" annotation_types = ["interventions", "outcomes", "participants"] @@ -265,11 +273,15 @@ def _generate_examples(self, documents_folder, annotations_folder, split_folder: with open( f"{annotations_folder}/hierarchical_labels/{annotation_type}/{split_folder}/{document}" ) as fp: - annotation_dict[annotation_type] = [int(x) for x in fp.read().splitlines()] + annotation_dict[annotation_type] = [ + int(x) for x in fp.read().splitlines() + ] except OSError: annotation_dict[annotation_type] = [] - ents = _get_entities_pico(annotation_dict, tokenized=tokenized, document_content=document_content) + ents = _get_entities_pico( + annotation_dict, tokenized=tokenized, document_content=document_content + ) if self.config.schema == "source": @@ -280,7 +292,9 @@ def _generate_examples(self, documents_folder, annotations_folder, split_folder: { "text": ent["annotation_text"], "annotation_type": ent["high_level_annotation_type"], - "fine_grained_annotation_type": ent["fine_grained_annotation_type"], + "fine_grained_annotation_type": ent[ + "fine_grained_annotation_type" + ], "start": ent["char_start"], "end": ent["char_end"], } diff --git a/bigbio/biodatasets/ehr_rel/ehr_rel.py b/bigbio/biodatasets/ehr_rel/ehr_rel.py index 2ad2f965a..f9b0967e8 100644 --- a/bigbio/biodatasets/ehr_rel/ehr_rel.py +++ b/bigbio/biodatasets/ehr_rel/ehr_rel.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/essai/essai.py b/bigbio/biodatasets/essai/essai.py index 289055a63..aab446381 100644 --- a/bigbio/biodatasets/essai/essai.py +++ b/bigbio/biodatasets/essai/essai.py @@ -9,7 +9,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.NEGATION, Tags.SPECULATION, 
Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/euadr/euadr.py b/bigbio/biodatasets/euadr/euadr.py index e68a1feb1..6923b5ad5 100644 --- a/bigbio/biodatasets/euadr/euadr.py +++ b/bigbio/biodatasets/euadr/euadr.py @@ -7,7 +7,15 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [ + Tags.ADR, + Tags.DRUG, + Tags.GENE, + Tags.DISEASE, + Tags.VARIANT, + Tags.NEGATION, + Tags.SPECULATION, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/evidence_inference/evidence_inference.py b/bigbio/biodatasets/evidence_inference/evidence_inference.py index e21ce4f47..d17594ca8 100644 --- a/bigbio/biodatasets/evidence_inference/evidence_inference.py +++ b/bigbio/biodatasets/evidence_inference/evidence_inference.py @@ -35,7 +35,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genetag/genetag.py b/bigbio/biodatasets/genetag/genetag.py index bfe13bf53..2faf3558e 100644 --- a/bigbio/biodatasets/genetag/genetag.py +++ b/bigbio/biodatasets/genetag/genetag.py @@ -32,7 +32,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py b/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py index 81c833687..f010eb3a2 100644 --- a/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py +++ b/bigbio/biodatasets/genia_relation_corpus/genia_relation_corpus.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.PART_OF] _LANGUAGES = 
[Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py b/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py index 7516e830d..66b55cf8e 100644 --- a/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py +++ b/bigbio/biodatasets/genia_term_corpus/genia_term_corpus.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.CELL, Tags.ANATOMY, Tags.TISSUE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/gnormplus/gnormplus.py b/bigbio/biodatasets/gnormplus/gnormplus.py index 28d16d360..fc1a2367b 100644 --- a/bigbio/biodatasets/gnormplus/gnormplus.py +++ b/bigbio/biodatasets/gnormplus/gnormplus.py @@ -27,7 +27,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py b/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py index 73439fe04..973bf970a 100644 --- a/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py +++ b/bigbio/biodatasets/hallmarks_of_cancer/hallmarks_of_cancer.py @@ -21,7 +21,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.CANCER] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -66,7 +66,7 @@ _URLs = { "corpus": "https://github.com/sb895/Hallmarks-of-Cancer/archive/refs/heads/master.zip", - "split_indices": "https://microsoft.github.io/BLURB/sample_code/data_generation.tar.gz" + "split_indices": "https://microsoft.github.io/BLURB/sample_code/data_generation.tar.gz", } _SUPPORTED_TASKS = [Tasks.TEXT_CLASSIFICATION] @@ -74,17 +74,17 @@ _BIGBIO_VERSION = "1.0.0" _CLASS_NAMES = [ - 'evading growth suppressors', - 'tumor promoting 
inflammation', - 'enabling replicative immortality', - 'cellular energetics', - 'resisting cell death', - 'activating invasion and metastasis', - 'genomic instability and mutation', - 'none', - 'inducing angiogenesis', - 'sustaining proliferative signaling', - 'avoiding immune destruction' + "evading growth suppressors", + "tumor promoting inflammation", + "enabling replicative immortality", + "cellular energetics", + "resisting cell death", + "activating invasion and metastasis", + "genomic instability and mutation", + "none", + "inducing angiogenesis", + "sustaining proliferative signaling", + "avoiding immune destruction", ] @@ -144,21 +144,24 @@ def _split_generators(self, dl_manager): name=datasets.Split.TRAIN, gen_kwargs={ "corpuspath": Path(data_dir["corpus"]), - "indicespath": Path(data_dir["split_indices"]) / "data_generation/indexing/HoC/train_pmid.tsv" + "indicespath": Path(data_dir["split_indices"]) + / "data_generation/indexing/HoC/train_pmid.tsv", }, ), datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ "corpuspath": Path(data_dir["corpus"]), - "indicespath": Path(data_dir["split_indices"]) / "data_generation/indexing/HoC/test_pmid.tsv" + "indicespath": Path(data_dir["split_indices"]) + / "data_generation/indexing/HoC/test_pmid.tsv", }, ), datasets.SplitGenerator( name=datasets.Split.VALIDATION, gen_kwargs={ "corpuspath": Path(data_dir["corpus"]), - "indicespath": Path(data_dir["split_indices"]) / "data_generation/indexing/HoC/dev_pmid.tsv" + "indicespath": Path(data_dir["split_indices"]) + / "data_generation/indexing/HoC/dev_pmid.tsv", }, ), ] @@ -184,13 +187,15 @@ def _generate_examples(self, corpuspath: Path, indicespath: Path): sentence, label = example_pair label = label.strip() - + if label == "": label = "none" multi_labels = [m_label.strip() for m_label in label.split("AND")] unique_multi_labels = { - m_label.split("--")[0].lower().lstrip() for m_label in multi_labels if m_label != "NULL" + m_label.split("--")[0].lower().lstrip() 
+ for m_label in multi_labels + if m_label != "NULL" } arrow_file_unique_key = 100 * document_index + example_index diff --git a/bigbio/biodatasets/hprd50/hprd50.py b/bigbio/biodatasets/hprd50/hprd50.py index 63de60017..834bc1c5d 100644 --- a/bigbio/biodatasets/hprd50/hprd50.py +++ b/bigbio/biodatasets/hprd50/hprd50.py @@ -42,7 +42,7 @@ from bigbio.utils.license import Licenses # TODO: Add BibTeX citation -_TAGS = [] +_TAGS = [Tags.GENE, Tags.PPI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/iepa/iepa.py b/bigbio/biodatasets/iepa/iepa.py index 157893562..be945fb60 100644 --- a/bigbio/biodatasets/iepa/iepa.py +++ b/bigbio/biodatasets/iepa/iepa.py @@ -33,7 +33,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.DRUG, Tags.DDI] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/jnlpba/jnlpba.py b/bigbio/biodatasets/jnlpba/jnlpba.py index a10a42981..9e03eaea1 100644 --- a/bigbio/biodatasets/jnlpba/jnlpba.py +++ b/bigbio/biodatasets/jnlpba/jnlpba.py @@ -29,7 +29,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.CELL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/linnaeus/linnaeus.py b/bigbio/biodatasets/linnaeus/linnaeus.py index 4a079d1d9..805188879 100644 --- a/bigbio/biodatasets/linnaeus/linnaeus.py +++ b/bigbio/biodatasets/linnaeus/linnaeus.py @@ -35,7 +35,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/lll/lll.py b/bigbio/biodatasets/lll/lll.py index ccc4eca80..560185a5f 100644 --- a/bigbio/biodatasets/lll/lll.py +++ b/bigbio/biodatasets/lll/lll.py @@ -39,7 +39,7 @@ from bigbio.utils.constants import 
BigBioValues, Lang, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py index 0db20bd50..cf572db03 100644 --- a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py +++ b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py @@ -25,7 +25,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.EN, Lang.FR, Lang.DE, Lang.NL, Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mayosrs/mayosrs.py b/bigbio/biodatasets/mayosrs/mayosrs.py index 160a66668..e0b63b87e 100644 --- a/bigbio/biodatasets/mayosrs/mayosrs.py +++ b/bigbio/biodatasets/mayosrs/mayosrs.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/med_qa/med_qa.py b/bigbio/biodatasets/med_qa/med_qa.py index 4cdbc1d96..5e000263c 100644 --- a/bigbio/biodatasets/med_qa/med_qa.py +++ b/bigbio/biodatasets/med_qa/med_qa.py @@ -32,7 +32,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.MULTIPLE_CHOICE, Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/meddialog/meddialog.py b/bigbio/biodatasets/meddialog/meddialog.py index 4d0e95b64..ee647d081 100644 --- a/bigbio/biodatasets/meddialog/meddialog.py +++ b/bigbio/biodatasets/meddialog/meddialog.py @@ -25,7 +25,7 @@ _DATASETNAME = "meddialog" -_TAGS = [] +_TAGS = [Tags.DIALOGUE] _LANGUAGES = [Lang.EN, Lang.ZH] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/meddocan/meddocan.py b/bigbio/biodatasets/meddocan/meddocan.py index 
e1fb393d1..d2dc14e9b 100644 --- a/bigbio/biodatasets/meddocan/meddocan.py +++ b/bigbio/biodatasets/meddocan/meddocan.py @@ -32,7 +32,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ANONYMIZATION] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medhop/medhop.py b/bigbio/biodatasets/medhop/medhop.py index 96c926399..196490081 100644 --- a/bigbio/biodatasets/medhop/medhop.py +++ b/bigbio/biodatasets/medhop/medhop.py @@ -23,7 +23,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.MULTIPLE_CHOICE, Tags.MRC] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/medical_data/medical_data.py b/bigbio/biodatasets/medical_data/medical_data.py index 80ddfdef7..48929faa6 100644 --- a/bigbio/biodatasets/medical_data/medical_data.py +++ b/bigbio/biodatasets/medical_data/medical_data.py @@ -24,7 +24,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DRUG, Tags.SENTIMENT_ANALYSIS] _LANGUAGES = [Lang.EN] _LOCAL = True _CITATION = """\ diff --git a/bigbio/biodatasets/mediqa_qa/mediqa_qa.py b/bigbio/biodatasets/mediqa_qa/mediqa_qa.py index 1c26254e7..5af9b45b2 100644 --- a/bigbio/biodatasets/mediqa_qa/mediqa_qa.py +++ b/bigbio/biodatasets/mediqa_qa/mediqa_qa.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.FACTOID, Tags.DISEASE, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/medmentions/medmentions.py b/bigbio/biodatasets/medmentions/medmentions.py index a1322f7e1..9c9746635 100644 --- a/bigbio/biodatasets/medmentions/medmentions.py +++ b/bigbio/biodatasets/medmentions/medmentions.py @@ -46,7 +46,7 @@ from bigbio.utils.constants import Lang, Tags, 
Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.ORGANISM] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index f2daadbbb..c4e07a69c 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -8,27 +8,37 @@ "SPECIES" : "Species", "GENE" : "Gene, proteins, gene products, ...", "DISEASE" : "Disease", + "DRUG" : "Drug", "CHEMICAL" : "Chemical", - "UMLS" : "Unified Medical Language System", "COVID" : "Coronavirus disease 2019 (COVID-19)", "LEXICAL" : "Lexical data (e.g. word, verbs,...)", - "DECS" : "Descriptores en Ciencias de la Salud", - "QA_YESNO" : "QA with yes no answer", - "QA_FACTOID" : "QA with factoid answer", - "QA_LIST": "QA with list of factoid answer", - "QA_SUMMARY_ANSWER" : "QA with abstractive summary answer", - "QA_HOW" : "`How` question", - "QA_WHY" : "`Why` question", + "YESNO" : "QA with yes no answer", + "HOW" : "`How` question", + "WHY" : "`Why` question", + "FACTOID" : "QA with factoid answer", + "FACTOIND_LIST": "QA with list of factoid answer", + "ABSTRACTIVE" : "Abstractive summary/answer", + "EXTRACTIVE" : "Extractive summary/answer", + "CLOZE_TEST" : "Cloze test", "GRANT" : "Grants data", "PPI" : "Protein-protein interaction", - "QA_CLOZE" : "Cloze test", "MRC" : "Machine Reading Comprehension", - "QA_MULTIPLE_CHOICE" : "QA with multiple choice", + "MULTIPLE_CHOICE" : "QA with multiple choice", "NEGATION" : "Negation", "SPECULATION" : "Speculation", "EPIGENETICS" : "Epigenetics", "PART_OF" : "Part-of relations", "CANCER" : "Cancer", - "PATHWAY" : "Pathway", - "MESH" : "Medical Subject Headings (MeSH)" + "PATHWAY_CURATION" : "Pathway curation", + "DOCUMENT_INDEXING" : "Document indexing", + "ADR" : "Adverse Drug Reaction", + "POS" : "Part of Speech Tagging", + "PICO" : "(P)articipants, (I)nterventions, and (O)utcomes", + "DDI" : "Drug-drug interaction", + "CONCEPT" : 
"Concept, Multi-word expression (MWE)", + "SENTENCE" : "Sentence", + "PROCEDURE" : "Procedure, treatment", + "DIALOGUE" : "Dialogue", + "ANONYMIZATION" : "Anonymizatio (De-identification)" + "SENTIMENT_ANALYSIS" : "Sentiment analysis" } From 9ebbdf4d2cfd77f2a26c7a4b526751a0541fbb80 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 13:09:42 +0200 Subject: [PATCH 05/20] new tags --- bigbio/utils/resources/tags.json | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index c4e07a69c..2dcc382e1 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -2,6 +2,7 @@ "SOCIAL_MEDIA" : "Social media", "ANATOMY" : "Anatomy", "ORGANISM" : "Organism", + "ORGAN" : "Organ", "VARIANT" : "Variant/Mutation", "TISSUE" : "Tissue", "CELL" : "Cells and/or cell lines", @@ -39,6 +40,8 @@ "SENTENCE" : "Sentence", "PROCEDURE" : "Procedure, treatment", "DIALOGUE" : "Dialogue", - "ANONYMIZATION" : "Anonymizatio (De-identification)" - "SENTIMENT_ANALYSIS" : "Sentiment analysis" + "ANONYMIZATION" : "Anonymizatio (De-identification)", + "SENTIMENT_ANALYSIS" : "Sentiment analysis", + "MIRNA" : "miRNA", + "ABBREVIATION" : "Abbreviation" } From 02deb9a37a62ecb926773c5cf0b2ecf0a2cac1c2 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 13:10:50 +0200 Subject: [PATCH 06/20] add tags --- bigbio/biodatasets/medal/medal.py | 15 +++++++++------ bigbio/biodatasets/meqsum/meqsum.py | 2 +- bigbio/biodatasets/minimayosrs/minimayosrs.py | 2 +- bigbio/biodatasets/mlee/mlee.py | 2 +- bigbio/biodatasets/msh_wsd/msh_wsd.py | 2 +- bigbio/biodatasets/muchmore/muchmore.py | 2 +- .../biodatasets/multi_xscience/multi_xscience.py | 2 +- .../mutation_finder/mutation_finder.py | 2 +- bigbio/biodatasets/nagel/nagel.py | 2 +- bigbio/biodatasets/ncbi_disease/ncbi_disease.py | 2 +- bigbio/biodatasets/nlm_gene/nlm_gene.py | 2 +- bigbio/biodatasets/nlm_wsd/nlm_wsd.py | 2 +- 
bigbio/biodatasets/nlmchem/nlmchem.py | 2 +- .../ntcir_13_medweb/ntcir_13_medweb.py | 2 +- bigbio/biodatasets/osiris/osiris.py | 2 +- bigbio/biodatasets/pcr/pcr.py | 2 +- bigbio/biodatasets/pdr/pdr.py | 2 +- 17 files changed, 25 insertions(+), 22 deletions(-) diff --git a/bigbio/biodatasets/medal/medal.py b/bigbio/biodatasets/medal/medal.py index 03df40fc3..0ceeb8cb4 100644 --- a/bigbio/biodatasets/medal/medal.py +++ b/bigbio/biodatasets/medal/medal.py @@ -31,7 +31,7 @@ logger = datasets.logging.get_logger(__name__) -_TAGS = [] +_TAGS = [Tags.ABBREVIATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False @@ -74,10 +74,11 @@ _BIGBIO_VERSION = "1.0.0" + class MedalDataset(datasets.GeneratorBasedBuilder): """The Repository for Medical Dataset for Abbreviation Disambiguation for Natural Language Understanding (MeDAL) is -a large medical text dataset curated for abbreviation disambiguation, designed for natural language understanding -pre-training in the medical domain.""" + a large medical text dataset curated for abbreviation disambiguation, designed for natural language understanding + pre-training in the medical domain.""" SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) @@ -124,7 +125,9 @@ def _info(self) -> datasets.DatasetInfo: citation=_CITATION, ) - def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + def _split_generators( + self, dl_manager: datasets.DownloadManager + ) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" urls = _URLS @@ -169,7 +172,7 @@ def _generate_offsets(self, text, location): Returns ------- - dict + dict "word": str, "offsets": tuple (int, int) """ @@ -179,7 +182,7 @@ def _generate_offsets(self, text, location): offset_end = offset_start + len(word) # return word and offsets - return {"word":word, "offsets":(offset_start, offset_end)} + return {"word": word, "offsets": (offset_start, offset_end)} def 
_generate_examples(self, filepath, split: str) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" diff --git a/bigbio/biodatasets/meqsum/meqsum.py b/bigbio/biodatasets/meqsum/meqsum.py index 21fe7f58c..a2a3d8bec 100644 --- a/bigbio/biodatasets/meqsum/meqsum.py +++ b/bigbio/biodatasets/meqsum/meqsum.py @@ -33,7 +33,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/minimayosrs/minimayosrs.py b/bigbio/biodatasets/minimayosrs/minimayosrs.py index f8f095bbe..cd2eba509 100644 --- a/bigbio/biodatasets/minimayosrs/minimayosrs.py +++ b/bigbio/biodatasets/minimayosrs/minimayosrs.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mlee/mlee.py b/bigbio/biodatasets/mlee/mlee.py index 478079624..5582f2193 100644 --- a/bigbio/biodatasets/mlee/mlee.py +++ b/bigbio/biodatasets/mlee/mlee.py @@ -32,7 +32,7 @@ _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" -_TAGS = [] +_TAGS = [Tags.GENE, Tags.DRUG, Tags.CELL, Tags.ORGAN, Tags.TISSUE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/msh_wsd/msh_wsd.py b/bigbio/biodatasets/msh_wsd/msh_wsd.py index 2195106ac..b47656330 100644 --- a/bigbio/biodatasets/msh_wsd/msh_wsd.py +++ b/bigbio/biodatasets/msh_wsd/msh_wsd.py @@ -43,7 +43,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ABBREVIATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/muchmore/muchmore.py b/bigbio/biodatasets/muchmore/muchmore.py index 3ae9d047d..bc5d1335e 100644 --- a/bigbio/biodatasets/muchmore/muchmore.py +++ 
b/bigbio/biodatasets/muchmore/muchmore.py @@ -76,7 +76,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.POS] _LANGUAGES = [Lang.EN, Lang.DE] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/multi_xscience/multi_xscience.py b/bigbio/biodatasets/multi_xscience/multi_xscience.py index 6be1347be..a5f9fcd3e 100644 --- a/bigbio/biodatasets/multi_xscience/multi_xscience.py +++ b/bigbio/biodatasets/multi_xscience/multi_xscience.py @@ -24,7 +24,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/mutation_finder/mutation_finder.py b/bigbio/biodatasets/mutation_finder/mutation_finder.py index 5dc113a7c..e14b715a9 100644 --- a/bigbio/biodatasets/mutation_finder/mutation_finder.py +++ b/bigbio/biodatasets/mutation_finder/mutation_finder.py @@ -23,7 +23,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nagel/nagel.py b/bigbio/biodatasets/nagel/nagel.py index fd8a05f68..0f5990ff7 100644 --- a/bigbio/biodatasets/nagel/nagel.py +++ b/bigbio/biodatasets/nagel/nagel.py @@ -26,7 +26,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [] +_TAGS = [Tags.MUTATION, Tags.GENE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ncbi_disease/ncbi_disease.py b/bigbio/biodatasets/ncbi_disease/ncbi_disease.py index 1efee20e5..c2b1d7487 100644 --- a/bigbio/biodatasets/ncbi_disease/ncbi_disease.py +++ b/bigbio/biodatasets/ncbi_disease/ncbi_disease.py @@ -29,7 +29,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import 
Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nlm_gene/nlm_gene.py b/bigbio/biodatasets/nlm_gene/nlm_gene.py index 1a6c0e06f..d084ad477 100644 --- a/bigbio/biodatasets/nlm_gene/nlm_gene.py +++ b/bigbio/biodatasets/nlm_gene/nlm_gene.py @@ -26,7 +26,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/nlm_wsd/nlm_wsd.py b/bigbio/biodatasets/nlm_wsd/nlm_wsd.py index 7437d8df2..3882db161 100644 --- a/bigbio/biodatasets/nlm_wsd/nlm_wsd.py +++ b/bigbio/biodatasets/nlm_wsd/nlm_wsd.py @@ -56,7 +56,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ABBREVIATION] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/nlmchem/nlmchem.py b/bigbio/biodatasets/nlmchem/nlmchem.py index e816e3788..10472c244 100644 --- a/bigbio/biodatasets/nlmchem/nlmchem.py +++ b/bigbio/biodatasets/nlmchem/nlmchem.py @@ -26,7 +26,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py index ff8734739..26e972f20 100644 --- a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py +++ b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py @@ -66,7 +66,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.SOCIAL_MEDIA, Tags.SENTIMENT_ANALYSIS] _LANGUAGES = [Lang.EN, Lang.ZH, Lang.JA] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/osiris/osiris.py 
b/bigbio/biodatasets/osiris/osiris.py index b83262563..19b0872c9 100644 --- a/bigbio/biodatasets/osiris/osiris.py +++ b/bigbio/biodatasets/osiris/osiris.py @@ -27,7 +27,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pcr/pcr.py b/bigbio/biodatasets/pcr/pcr.py index 28e3987e9..8295b1777 100644 --- a/bigbio/biodatasets/pcr/pcr.py +++ b/bigbio/biodatasets/pcr/pcr.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pdr/pdr.py b/bigbio/biodatasets/pdr/pdr.py index 1c7bb9f77..efa60062c 100644 --- a/bigbio/biodatasets/pdr/pdr.py +++ b/bigbio/biodatasets/pdr/pdr.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False From 5d42aebe66ae44456f1f1ed04d0671164ce0fae5 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 13:14:42 +0200 Subject: [PATCH 07/20] add tags --- bigbio/biodatasets/mirna/mirna.py | 742 +++++++++++++++--------------- 1 file changed, 380 insertions(+), 362 deletions(-) diff --git a/bigbio/biodatasets/mirna/mirna.py b/bigbio/biodatasets/mirna/mirna.py index aa7e72793..44babefe0 100644 --- a/bigbio/biodatasets/mirna/mirna.py +++ b/bigbio/biodatasets/mirna/mirna.py @@ -1,366 +1,384 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import xml.etree.ElementTree as ET -from typing import Dict, Iterator, List, Tuple - -import datasets - -from bigbio.utils import schemas -from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import Lang, Tags, Tasks +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import xml.etree.ElementTree as ET +from typing import Dict, Iterator, List, Tuple + +import datasets + +from bigbio.utils import schemas +from bigbio.utils.configs import BigBioConfig +from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses - -_TAGS = [] + +_TAGS = [Tags.MIRNA, Tags.GENE, Tags.DISEASE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False -_CITATION = """\ -@Article{Bagewadi2014, -author={Bagewadi, Shweta -and Bobi{\'{c}}, Tamara -and Hofmann-Apitius, Martin -and Fluck, Juliane -and Klinger, Roman}, -title={Detecting miRNA Mentions and Relations in Biomedical Literature}, -journal={F1000Research}, -year={2014}, -month={Aug}, -day={28}, -publisher={F1000Research}, -volume={3}, -pages={205-205}, -keywords={MicroRNAs; corpus; prediction algorithms}, -abstract={ - INTRODUCTION: MicroRNAs (miRNAs) have demonstrated their potential as post-transcriptional - gene expression regulators, participating in a wide spectrum of regulatory events such as - apoptosis, differentiation, and stress response. Apart from the role of miRNAs in normal - physiology, their dysregulation is implicated in a vast array of diseases. Dissection of - miRNA-related associations are valuable for contemplating their mechanism in diseases, - leading to the discovery of novel miRNAs for disease prognosis, diagnosis, and therapy. - MOTIVATION: Apart from databases and prediction tools, miRNA-related information is largely - available as unstructured text. Manual retrieval of these associations can be labor-intensive - due to steadily growing number of publications. Additionally, most of the published miRNA - entity recognition methods are keyword based, further subjected to manual inspection for - retrieval of relations. 
Despite the fact that several databases host miRNA-associations - derived from text, lower sensitivity and lack of published details for miRNA entity - recognition and associated relations identification has motivated the need for developing - comprehensive methods that are freely available for the scientific community. Additionally, - the lack of a standard corpus for miRNA-relations has caused difficulty in evaluating the - available systems. We propose methods to automatically extract mentions of miRNAs, species, - genes/proteins, disease, and relations from scientific literature. Our generated corpora, - along with dictionaries, and miRNA regular expression are freely available for academic - purposes. To our knowledge, these resources are the most comprehensive developed so far. - RESULTS: The identification of specific miRNA mentions reaches a recall of 0.94 and - precision of 0.93. Extraction of miRNA-disease and miRNA-gene relations lead to an - F1 score of up to 0.76. A comparison of the information extracted by our approach to - the databases miR2Disease and miRSel for the extraction of Alzheimer's disease - related relations shows the capability of our proposed methods in identifying correct - relations with improved sensitivity. The published resources and described methods can - help the researchers for maximal retrieval of miRNA-relations and generation of - miRNA-regulatory networks. AVAILABILITY: The training and test corpora, annotation - guidelines, developed dictionaries, and supplementary files are available at - http://www.scai.fraunhofer.de/mirna-corpora.html. -}, -note={26535109[pmid]}, -note={PMC4602280[pmcid]}, -issn={2046-1402}, -url={https://pubmed.ncbi.nlm.nih.gov/26535109}, -language={eng} -} -""" - -_DATASETNAME = "mirna" - -_DESCRIPTION = """\ -The corpus consists of 301 Medline citations. The documents were screened for -mentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually -annotated. 
The corpus comprises of two separate files, a train and a test set, coming -from 201 and 100 documents respectively. -""" - -_HOMEPAGE = "https://www.scai.fraunhofer.de/en/business-research-areas/bioinformatics/downloads/download-mirna-test-corpus.html" - -_LICENSE = Licenses.CC_BY_NC_3p0 - -_BASE = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/miRNA/miRNA-" - -_URLs = { - "source": { - "train": _BASE + "Train-Corpus.xml", - "test": _BASE + "Test-Corpus.xml", - }, - "bigbio_kb": { - "train": _BASE + "Train-Corpus.xml", - "test": _BASE + "Test-Corpus.xml", - }, -} - -_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION] -_SOURCE_VERSION = "1.0.0" -_BIGBIO_VERSION = "1.0.0" - - -class miRNADataset(datasets.GeneratorBasedBuilder): - """mirna""" - - SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) - BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) - - BUILDER_CONFIGS = [ - BigBioConfig( - name="mirna_source", - version=SOURCE_VERSION, - description="mirna source schema", - schema="source", - subset_id="mirna", - ), - BigBioConfig( - name="mirna_bigbio_kb", - version=BIGBIO_VERSION, - description="mirna BigBio schema", - schema="bigbio_kb", - subset_id="mirna", - ), - ] - - DEFAULT_CONFIG_NAME = "mirna_source" - - def _info(self): - - if self.config.schema == "source": - - features = datasets.Features( - { - "passages": [ - { - "document_id": datasets.Value("string"), - "type": datasets.Value("string"), - "text": datasets.Value("string"), - "offset": datasets.Value("int32"), - "entities": [ - { - "id": datasets.Value("string"), - "offsets": [[datasets.Value("int32")]], - "text": [datasets.Value("string")], - "type": datasets.Value("string"), - "normalized": [ - { - "db_name": datasets.Value("string"), - "db_id": datasets.Value("string"), - } - ], - } - ], - } - ] - } - ) - - elif self.config.schema == "bigbio_kb": - features = schemas.kb_features - - return datasets.DatasetInfo( - 
description=_DESCRIPTION, - features=features, - supervised_keys=None, - homepage=_HOMEPAGE, - license=str(_LICENSE), - citation=_CITATION, - ) - - def _split_generators(self, dl_manager): - """Returns SplitGenerators.""" - - my_urls = _URLs[self.config.schema] - - path_xml_train = dl_manager.download(my_urls["train"]) - path_xml_test = dl_manager.download(my_urls["test"]) - - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, - # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": path_xml_train, - "split": "train", - }, - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, - # These kwargs will be passed to _generate_examples - gen_kwargs={ - "filepath": path_xml_test, - "split": "test", - }, - ), - ] - - def _get_passages_and_entities(self, d) -> Tuple[List[Dict], List[List[Dict]]]: - - sentences: List[Dict] = [] - entities: List[List[Dict]] = [] - relations: List[List[Dict]] = [] - - text_total_length = 0 - - po_start = 0 - - # Get sentences of the document - for _, s in enumerate(d): - - # annotation used only for document indexing - if s.attrib["text"] is None or len(s.attrib["text"]) <= 0: - continue - - # annotation used only for document indexing - if len(s) <= 0: - continue - - text_total_length += len(s.attrib["text"]) + 1 - - po_end = po_start + len(s.attrib["text"]) - - start = po_start - - dp = { - "text": s.attrib["text"], - "type": "title" if ".s0" in s.attrib["id"] else "abstract", - "offsets": [(po_start, po_end)], - "offset": 0, # original offset - } - - po_start = po_end + 1 - - sentences.append(dp) - - pe = [] # entities - re = [] # relations - - # For each entity - for a in s: - - # If correspond to a entity - if a.tag == "entity": - - length = len(a.attrib["text"]) - - if a.attrib["text"] is None or length <= 0: - continue - - # no in-text annotation: only for document indexing - if a.attrib["type"] in ["MeSH_Indexing_Chemical", "OTHER"]: - continue - - startOffset, endOffset = 
a.attrib["charOffset"].split("-") - startOffset, endOffset = int(startOffset), int(endOffset) - - pe.append( - { - "id": a.attrib["id"], - "type": a.attrib["type"], - "text": (a.attrib["text"],), - "offsets": [(start + startOffset, start + endOffset + 1)], - "normalized": [{"db_name": "miRNA-corpus", "db_id": a.attrib["id"]}], - } - ) - - # If correspond to relation pair - elif a.tag == "pair": - - re.append( - { - "id": a.attrib["id"], - "type": a.attrib["type"], - "arg1_id": a.attrib["e1"], - "arg2_id": a.attrib["e2"], - "normalized": [], - } - ) - - entities.append(pe) - relations.append(re) - - return sentences, entities, relations - - def _generate_examples( - self, - filepath: str, - split: str, - ) -> Iterator[Tuple[int, Dict]]: - """Yields examples as (key, example) tuples.""" - - reader = ET.fromstring(open(str(filepath), "r").read()) - - if self.config.schema == "source": - - for uid, doc in enumerate(reader): - - sentences, sentences_entities, relations = self._get_passages_and_entities(doc) - - if len(sentences) < 1 or len(sentences_entities) < 1 or len(sentences_entities) != len(sentences): - continue - - for p, pe, re in zip(sentences, sentences_entities, relations): - - p.pop("offsets") # BioC has only start for passages offsets - - p["document_id"] = doc.attrib["id"] - p["entities"] = pe # BioC has per passage entities - - yield uid, {"passages": sentences} - - elif self.config.schema == "bigbio_kb": - - uid = 0 - - for idx, doc in enumerate(reader): - - sentences, sentences_entities, relations = self._get_passages_and_entities(doc) - - if len(sentences) < 1 or len(sentences_entities) < 1 or len(sentences_entities) != len(sentences): - continue - - # global id - uid += 1 - - # unpack per-sentence entities - entities = [e for pe in sentences_entities for e in pe] - - for p in sentences: - p.pop("offset") # drop original offset - p["text"] = (p["text"],) # text in sentence is Sequence - p["id"] = uid - uid += 1 - - for e in entities: - e["id"] = uid - 
uid += 1 - - # unpack per-sentence relations - relations = [r for re in relations for r in re] - - for r in relations: - r["id"] = uid - uid += 1 - - yield idx, { - "id": uid, - "document_id": doc.attrib["id"], - "passages": sentences, - "entities": entities, - "events": [], - "coreferences": [], - "relations": relations, - } +_CITATION = """\ +@Article{Bagewadi2014, +author={Bagewadi, Shweta +and Bobi{\'{c}}, Tamara +and Hofmann-Apitius, Martin +and Fluck, Juliane +and Klinger, Roman}, +title={Detecting miRNA Mentions and Relations in Biomedical Literature}, +journal={F1000Research}, +year={2014}, +month={Aug}, +day={28}, +publisher={F1000Research}, +volume={3}, +pages={205-205}, +keywords={MicroRNAs; corpus; prediction algorithms}, +abstract={ + INTRODUCTION: MicroRNAs (miRNAs) have demonstrated their potential as post-transcriptional + gene expression regulators, participating in a wide spectrum of regulatory events such as + apoptosis, differentiation, and stress response. Apart from the role of miRNAs in normal + physiology, their dysregulation is implicated in a vast array of diseases. Dissection of + miRNA-related associations are valuable for contemplating their mechanism in diseases, + leading to the discovery of novel miRNAs for disease prognosis, diagnosis, and therapy. + MOTIVATION: Apart from databases and prediction tools, miRNA-related information is largely + available as unstructured text. Manual retrieval of these associations can be labor-intensive + due to steadily growing number of publications. Additionally, most of the published miRNA + entity recognition methods are keyword based, further subjected to manual inspection for + retrieval of relations. 
Despite the fact that several databases host miRNA-associations + derived from text, lower sensitivity and lack of published details for miRNA entity + recognition and associated relations identification has motivated the need for developing + comprehensive methods that are freely available for the scientific community. Additionally, + the lack of a standard corpus for miRNA-relations has caused difficulty in evaluating the + available systems. We propose methods to automatically extract mentions of miRNAs, species, + genes/proteins, disease, and relations from scientific literature. Our generated corpora, + along with dictionaries, and miRNA regular expression are freely available for academic + purposes. To our knowledge, these resources are the most comprehensive developed so far. + RESULTS: The identification of specific miRNA mentions reaches a recall of 0.94 and + precision of 0.93. Extraction of miRNA-disease and miRNA-gene relations lead to an + F1 score of up to 0.76. A comparison of the information extracted by our approach to + the databases miR2Disease and miRSel for the extraction of Alzheimer's disease + related relations shows the capability of our proposed methods in identifying correct + relations with improved sensitivity. The published resources and described methods can + help the researchers for maximal retrieval of miRNA-relations and generation of + miRNA-regulatory networks. AVAILABILITY: The training and test corpora, annotation + guidelines, developed dictionaries, and supplementary files are available at + http://www.scai.fraunhofer.de/mirna-corpora.html. +}, +note={26535109[pmid]}, +note={PMC4602280[pmcid]}, +issn={2046-1402}, +url={https://pubmed.ncbi.nlm.nih.gov/26535109}, +language={eng} +} +""" + +_DATASETNAME = "mirna" + +_DESCRIPTION = """\ +The corpus consists of 301 Medline citations. The documents were screened for +mentions of miRNA in the abstract text. Gene, disease and miRNA entities were manually +annotated. 
The corpus comprises of two separate files, a train and a test set, coming +from 201 and 100 documents respectively. +""" + +_HOMEPAGE = "https://www.scai.fraunhofer.de/en/business-research-areas/bioinformatics/downloads/download-mirna-test-corpus.html" + +_LICENSE = Licenses.CC_BY_NC_3p0 + +_BASE = "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/miRNA/miRNA-" + +_URLs = { + "source": { + "train": _BASE + "Train-Corpus.xml", + "test": _BASE + "Test-Corpus.xml", + }, + "bigbio_kb": { + "train": _BASE + "Train-Corpus.xml", + "test": _BASE + "Test-Corpus.xml", + }, +} + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION, Tasks.NAMED_ENTITY_DISAMBIGUATION] +_SOURCE_VERSION = "1.0.0" +_BIGBIO_VERSION = "1.0.0" + + +class miRNADataset(datasets.GeneratorBasedBuilder): + """mirna""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="mirna_source", + version=SOURCE_VERSION, + description="mirna source schema", + schema="source", + subset_id="mirna", + ), + BigBioConfig( + name="mirna_bigbio_kb", + version=BIGBIO_VERSION, + description="mirna BigBio schema", + schema="bigbio_kb", + subset_id="mirna", + ), + ] + + DEFAULT_CONFIG_NAME = "mirna_source" + + def _info(self): + + if self.config.schema == "source": + + features = datasets.Features( + { + "passages": [ + { + "document_id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Value("string"), + "offset": datasets.Value("int32"), + "entities": [ + { + "id": datasets.Value("string"), + "offsets": [[datasets.Value("int32")]], + "text": [datasets.Value("string")], + "type": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } + ] + } + ) + + elif self.config.schema == "bigbio_kb": + features = schemas.kb_features + + return datasets.DatasetInfo( + 
description=_DESCRIPTION, + features=features, + supervised_keys=None, + homepage=_HOMEPAGE, + license=str(_LICENSE), + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + + my_urls = _URLs[self.config.schema] + + path_xml_train = dl_manager.download(my_urls["train"]) + path_xml_test = dl_manager.download(my_urls["test"]) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": path_xml_train, + "split": "train", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + # These kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": path_xml_test, + "split": "test", + }, + ), + ] + + def _get_passages_and_entities(self, d) -> Tuple[List[Dict], List[List[Dict]]]: + + sentences: List[Dict] = [] + entities: List[List[Dict]] = [] + relations: List[List[Dict]] = [] + + text_total_length = 0 + + po_start = 0 + + # Get sentences of the document + for _, s in enumerate(d): + + # annotation used only for document indexing + if s.attrib["text"] is None or len(s.attrib["text"]) <= 0: + continue + + # annotation used only for document indexing + if len(s) <= 0: + continue + + text_total_length += len(s.attrib["text"]) + 1 + + po_end = po_start + len(s.attrib["text"]) + + start = po_start + + dp = { + "text": s.attrib["text"], + "type": "title" if ".s0" in s.attrib["id"] else "abstract", + "offsets": [(po_start, po_end)], + "offset": 0, # original offset + } + + po_start = po_end + 1 + + sentences.append(dp) + + pe = [] # entities + re = [] # relations + + # For each entity + for a in s: + + # If correspond to a entity + if a.tag == "entity": + + length = len(a.attrib["text"]) + + if a.attrib["text"] is None or length <= 0: + continue + + # no in-text annotation: only for document indexing + if a.attrib["type"] in ["MeSH_Indexing_Chemical", "OTHER"]: + continue + + startOffset, endOffset = 
a.attrib["charOffset"].split("-") + startOffset, endOffset = int(startOffset), int(endOffset) + + pe.append( + { + "id": a.attrib["id"], + "type": a.attrib["type"], + "text": (a.attrib["text"],), + "offsets": [(start + startOffset, start + endOffset + 1)], + "normalized": [ + {"db_name": "miRNA-corpus", "db_id": a.attrib["id"]} + ], + } + ) + + # If correspond to relation pair + elif a.tag == "pair": + + re.append( + { + "id": a.attrib["id"], + "type": a.attrib["type"], + "arg1_id": a.attrib["e1"], + "arg2_id": a.attrib["e2"], + "normalized": [], + } + ) + + entities.append(pe) + relations.append(re) + + return sentences, entities, relations + + def _generate_examples( + self, + filepath: str, + split: str, + ) -> Iterator[Tuple[int, Dict]]: + """Yields examples as (key, example) tuples.""" + + reader = ET.fromstring(open(str(filepath), "r").read()) + + if self.config.schema == "source": + + for uid, doc in enumerate(reader): + + ( + sentences, + sentences_entities, + relations, + ) = self._get_passages_and_entities(doc) + + if ( + len(sentences) < 1 + or len(sentences_entities) < 1 + or len(sentences_entities) != len(sentences) + ): + continue + + for p, pe, re in zip(sentences, sentences_entities, relations): + + p.pop("offsets") # BioC has only start for passages offsets + + p["document_id"] = doc.attrib["id"] + p["entities"] = pe # BioC has per passage entities + + yield uid, {"passages": sentences} + + elif self.config.schema == "bigbio_kb": + + uid = 0 + + for idx, doc in enumerate(reader): + + ( + sentences, + sentences_entities, + relations, + ) = self._get_passages_and_entities(doc) + + if ( + len(sentences) < 1 + or len(sentences_entities) < 1 + or len(sentences_entities) != len(sentences) + ): + continue + + # global id + uid += 1 + + # unpack per-sentence entities + entities = [e for pe in sentences_entities for e in pe] + + for p in sentences: + p.pop("offset") # drop original offset + p["text"] = (p["text"],) # text in sentence is Sequence + p["id"] = 
uid + uid += 1 + + for e in entities: + e["id"] = uid + uid += 1 + + # unpack per-sentence relations + relations = [r for re in relations for r in re] + + for r in relations: + r["id"] = uid + uid += 1 + + yield idx, { + "id": uid, + "document_id": doc.attrib["id"], + "passages": sentences, + "entities": entities, + "events": [], + "coreferences": [], + "relations": relations, + } From 52c561d3ee3170d49a7e867880a9883d24797c2a Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:29:25 +0200 Subject: [PATCH 08/20] new tags --- bigbio/utils/resources/tags.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index 2dcc382e1..51ed9f1b9 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -43,5 +43,7 @@ "ANONYMIZATION" : "Anonymizatio (De-identification)", "SENTIMENT_ANALYSIS" : "Sentiment analysis", "MIRNA" : "miRNA", - "ABBREVIATION" : "Abbreviation" + "ABBREVIATION" : "Abbreviation", + "FACT_CHECKING" : "Fact-checking", + "INTENT" : "Intent" } From 14fff7959cf7625abb58fe05f7517da17c5e6c3f Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:29:33 +0200 Subject: [PATCH 09/20] complete adding tags --- bigbio/biodatasets/ask_a_patient/ask_a_patient.py | 2 +- bigbio/biodatasets/pharmaconer/pharmaconer.py | 2 +- bigbio/biodatasets/pho_ner/pho_ner.py | 2 +- bigbio/biodatasets/pico_extraction/pico_extraction.py | 2 +- bigbio/biodatasets/progene/progene.py | 2 +- bigbio/biodatasets/psytar/psytar.py | 2 +- bigbio/biodatasets/pubhealth/pubhealth.py | 2 +- bigbio/biodatasets/pubmed_qa/pubmed_qa.py | 4 ++-- bigbio/biodatasets/pubtator_central/pubtator_central.py | 2 +- bigbio/biodatasets/quaero/quaero.py | 9 ++++++++- bigbio/biodatasets/scai_chemical/scai_chemical.py | 2 +- bigbio/biodatasets/scai_disease/scai_disease.py | 2 +- bigbio/biodatasets/scicite/scicite.py | 2 +- bigbio/biodatasets/scifact/scifact.py | 2 +- 
bigbio/biodatasets/sciq/sciq.py | 2 +- bigbio/biodatasets/seth_corpus/seth_corpus.py | 2 +- bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py | 2 +- .../swedish_medical_ner/swedish_medical_ner.py | 2 +- bigbio/biodatasets/thomas2011/thomas2011.py | 2 +- bigbio/biodatasets/tmvar_v1/tmvar_v1.py | 2 +- bigbio/biodatasets/tmvar_v2/tmvar_v2.py | 2 +- bigbio/biodatasets/tmvar_v3/tmvar_v3.py | 2 +- bigbio/biodatasets/twadrl/twadrl.py | 2 +- bigbio/biodatasets/umnsrs/umnsrs.py | 2 +- bigbio/biodatasets/verspoor_2013/verspoor_2013.py | 2 +- 25 files changed, 33 insertions(+), 26 deletions(-) diff --git a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py index bd89c5026..53bc81a9c 100644 --- a/bigbio/biodatasets/ask_a_patient/ask_a_patient.py +++ b/bigbio/biodatasets/ask_a_patient/ask_a_patient.py @@ -26,7 +26,7 @@ _DATASETNAME = "ask_a_patient" -_TAGS = [Tags.SOCIAL_MEDIA, Tags.ADR] +_TAGS = [Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pharmaconer/pharmaconer.py b/bigbio/biodatasets/pharmaconer/pharmaconer.py index f20fd87f1..ac5aade0b 100644 --- a/bigbio/biodatasets/pharmaconer/pharmaconer.py +++ b/bigbio/biodatasets/pharmaconer/pharmaconer.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.GENE, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pho_ner/pho_ner.py b/bigbio/biodatasets/pho_ner/pho_ner.py index 4ae3852d1..32e0e4e02 100644 --- a/bigbio/biodatasets/pho_ner/pho_ner.py +++ b/bigbio/biodatasets/pho_ner/pho_ner.py @@ -23,7 +23,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.COVID] _LANGUAGES = [Lang.VI] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pico_extraction/pico_extraction.py
b/bigbio/biodatasets/pico_extraction/pico_extraction.py index ab4c36f25..7fba82aba 100644 --- a/bigbio/biodatasets/pico_extraction/pico_extraction.py +++ b/bigbio/biodatasets/pico_extraction/pico_extraction.py @@ -30,7 +30,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.PICO] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/progene/progene.py b/bigbio/biodatasets/progene/progene.py index f1ce6223a..3456fdd26 100644 --- a/bigbio/biodatasets/progene/progene.py +++ b/bigbio/biodatasets/progene/progene.py @@ -25,7 +25,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/psytar/psytar.py b/bigbio/biodatasets/psytar/psytar.py index e0931739a..fef90eed6 100644 --- a/bigbio/biodatasets/psytar/psytar.py +++ b/bigbio/biodatasets/psytar/psytar.py @@ -54,7 +54,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DRUG, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/pubhealth/pubhealth.py b/bigbio/biodatasets/pubhealth/pubhealth.py index 5320c16e6..6d64352e4 100644 --- a/bigbio/biodatasets/pubhealth/pubhealth.py +++ b/bigbio/biodatasets/pubhealth/pubhealth.py @@ -31,7 +31,7 @@ logger = datasets.utils.logging.get_logger(__name__) -_TAGS = [] +_TAGS = [Tags.FACT_CHECKING] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/pubmed_qa/pubmed_qa.py b/bigbio/biodatasets/pubmed_qa/pubmed_qa.py index 4bdf15062..7203b608a 100644 --- a/bigbio/biodatasets/pubmed_qa/pubmed_qa.py +++ b/bigbio/biodatasets/pubmed_qa/pubmed_qa.py @@ -27,10 +27,10 @@ import bigbio.utils.parsing as parsing import bigbio.utils.schemas as schemas from bigbio.utils.configs import 
BigBioConfig -from bigbio.utils.constants import BigBioValues, Lang, Tasks +from bigbio.utils.constants import BigBioValues, Lang, Tasks, Tags from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.YESNO, Tags.ABSTRACTIVE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pubtator_central/pubtator_central.py b/bigbio/biodatasets/pubtator_central/pubtator_central.py index 50048a96f..8ba512d3f 100644 --- a/bigbio/biodatasets/pubtator_central/pubtator_central.py +++ b/bigbio/biodatasets/pubtator_central/pubtator_central.py @@ -51,7 +51,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CELL, Tags.SPECIES, Tags.VARIANT, Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/quaero/quaero.py b/bigbio/biodatasets/quaero/quaero.py index 29558a115..2d949b028 100644 --- a/bigbio/biodatasets/quaero/quaero.py +++ b/bigbio/biodatasets/quaero/quaero.py @@ -9,7 +9,14 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [ + Tags.CHEMICAL, + Tags.ANATOMY, + Tags.DRUG, + Tags.SPECIES, + Tags.PROCEDURE, + Tags.DISEASE, +] _LANGUAGES = [Lang.FR] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scai_chemical/scai_chemical.py b/bigbio/biodatasets/scai_chemical/scai_chemical.py index e3c4ef800..2935b9a04 100644 --- a/bigbio/biodatasets/scai_chemical/scai_chemical.py +++ b/bigbio/biodatasets/scai_chemical/scai_chemical.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scai_disease/scai_disease.py b/bigbio/biodatasets/scai_disease/scai_disease.py index 4b7905d9b..d4bdb3f9c 100644 --- 
a/bigbio/biodatasets/scai_disease/scai_disease.py +++ b/bigbio/biodatasets/scai_disease/scai_disease.py @@ -33,7 +33,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/scicite/scicite.py b/bigbio/biodatasets/scicite/scicite.py index 3a0f3284b..0626f7b70 100644 --- a/bigbio/biodatasets/scicite/scicite.py +++ b/bigbio/biodatasets/scicite/scicite.py @@ -40,7 +40,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.INTENT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/scifact/scifact.py b/bigbio/biodatasets/scifact/scifact.py index 14a48e21a..c537fcfba 100644 --- a/bigbio/biodatasets/scifact/scifact.py +++ b/bigbio/biodatasets/scifact/scifact.py @@ -25,7 +25,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.FACT_CHECKING] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/sciq/sciq.py b/bigbio/biodatasets/sciq/sciq.py index 54f44e622..eee43620f 100644 --- a/bigbio/biodatasets/sciq/sciq.py +++ b/bigbio/biodatasets/sciq/sciq.py @@ -25,7 +25,7 @@ _DATASETNAME = "sciq" -_TAGS = [] +_TAGS = [Tags.MULTIPLE_CHOICE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/seth_corpus/seth_corpus.py b/bigbio/biodatasets/seth_corpus/seth_corpus.py index 82b84b534..fbf5c754c 100644 --- a/bigbio/biodatasets/seth_corpus/seth_corpus.py +++ b/bigbio/biodatasets/seth_corpus/seth_corpus.py @@ -31,7 +31,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py 
b/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py index a6b16123f..3936b2305 100644 --- a/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py +++ b/bigbio/biodatasets/spl_adr_200db/spl_adr_200db.py @@ -67,7 +67,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.ADR, Tags.DRUG, Tags.NEGATION] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py b/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py index 1ec26aca1..4ece98c1a 100644 --- a/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py +++ b/bigbio/biodatasets/swedish_medical_ner/swedish_medical_ner.py @@ -43,7 +43,7 @@ _DATASETNAME = "swedish_medical_ner" -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.DRUG, Tags.ANATOMY] _LANGUAGES = [Lang.SV] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/thomas2011/thomas2011.py b/bigbio/biodatasets/thomas2011/thomas2011.py index d2747c4a8..d55c650a3 100644 --- a/bigbio/biodatasets/thomas2011/thomas2011.py +++ b/bigbio/biodatasets/thomas2011/thomas2011.py @@ -53,7 +53,7 @@ from bigbio.utils.license import CustomLicense # TODO: Add BibTeX citation -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v1/tmvar_v1.py b/bigbio/biodatasets/tmvar_v1/tmvar_v1.py index f6cb22e59..93c910e86 100644 --- a/bigbio/biodatasets/tmvar_v1/tmvar_v1.py +++ b/bigbio/biodatasets/tmvar_v1/tmvar_v1.py @@ -26,7 +26,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v2/tmvar_v2.py b/bigbio/biodatasets/tmvar_v2/tmvar_v2.py index b522524b6..a3518bbf1 100644 --- a/bigbio/biodatasets/tmvar_v2/tmvar_v2.py +++ b/bigbio/biodatasets/tmvar_v2/tmvar_v2.py @@ -26,7 +26,7 @@ from 
bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.VARIANT] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/tmvar_v3/tmvar_v3.py b/bigbio/biodatasets/tmvar_v3/tmvar_v3.py index 1e2bb9dd4..197a33fc9 100644 --- a/bigbio/biodatasets/tmvar_v3/tmvar_v3.py +++ b/bigbio/biodatasets/tmvar_v3/tmvar_v3.py @@ -44,7 +44,7 @@ copyright = {Creative Commons Attribution 4.0 International} } """ -_TAGS = [] +_TAGS = [Tags.VARIANT, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/twadrl/twadrl.py b/bigbio/biodatasets/twadrl/twadrl.py index abfb83a91..1735b2ec1 100644 --- a/bigbio/biodatasets/twadrl/twadrl.py +++ b/bigbio/biodatasets/twadrl/twadrl.py @@ -26,7 +26,7 @@ _DATASETNAME = "twadrl" -_TAGS = [] +_TAGS = [Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/umnsrs/umnsrs.py b/bigbio/biodatasets/umnsrs/umnsrs.py index 8bb39e554..07f603e87 100644 --- a/bigbio/biodatasets/umnsrs/umnsrs.py +++ b/bigbio/biodatasets/umnsrs/umnsrs.py @@ -32,7 +32,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CONCEPT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/verspoor_2013/verspoor_2013.py b/bigbio/biodatasets/verspoor_2013/verspoor_2013.py index 58f613132..2464a95f8 100644 --- a/bigbio/biodatasets/verspoor_2013/verspoor_2013.py +++ b/bigbio/biodatasets/verspoor_2013/verspoor_2013.py @@ -35,7 +35,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.VARIANT, Tags.CANCER] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False From 71ceed5f1fddd71d5895eb9499983ade88320ab8 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:32:13 +0200 Subject: [PATCH 10/20] ORGANISM is SPECIES, SOCIAL_MEDIA belongs to 
`source` not `subtask` --- bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py | 2 +- bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py | 2 +- bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py | 2 +- bigbio/biodatasets/cadec/cadec.py | 2 +- bigbio/biodatasets/cord_ner/cord_ner.py | 2 +- bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py b/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py index c5e0734d5..775c56fc0 100644 --- a/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py +++ b/bigbio/biodatasets/bionlp_st_2011_id/bionlp_st_2011_id.py @@ -31,7 +31,7 @@ Tags.DISEASE, Tags.GENE, Tags.CHEMICAL, - Tags.ORGANISM, + Tags.SPECIES, Tags.SPECULATION, Tags.NEGATION, ] diff --git a/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py b/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py index 1241b22c5..bc61c02eb 100644 --- a/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py +++ b/bigbio/biodatasets/bionlp_st_2013_gro/bionlp_st_2013_gro.py @@ -28,7 +28,7 @@ _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" -_TAGS = [Tags.GENE, Tags.ORGANISM, Tags.CELL, Tags.TISSUE] +_TAGS = [Tags.GENE, Tags.SPECIES, Tags.CELL, Tags.TISSUE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py b/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py index 8d464b851..f399df666 100644 --- a/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py +++ b/bigbio/biodatasets/bionlp_st_2019_bb/bionlp_st_2019_bb.py @@ -27,7 +27,7 @@ _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" -_TAGS = [Tags.ORGANISM] +_TAGS = [Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/cadec/cadec.py b/bigbio/biodatasets/cadec/cadec.py index c604c0920..f9c604467 100644 --- 
a/bigbio/biodatasets/cadec/cadec.py +++ b/bigbio/biodatasets/cadec/cadec.py @@ -38,7 +38,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [Tags.SOCIAL_MEDIA, Tags.DISEASE, Tags.ADR, Tags.DRUG] +_TAGS = [Tags.DISEASE, Tags.ADR, Tags.DRUG] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/cord_ner/cord_ner.py b/bigbio/biodatasets/cord_ner/cord_ner.py index 5457155d7..f29c298ea 100644 --- a/bigbio/biodatasets/cord_ner/cord_ner.py +++ b/bigbio/biodatasets/cord_ner/cord_ner.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CHEMICAL, Tags.COVID, Tags.ORGANISM] +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.CHEMICAL, Tags.COVID, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py index 26e972f20..35e93d536 100644 --- a/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py +++ b/bigbio/biodatasets/ntcir_13_medweb/ntcir_13_medweb.py @@ -66,7 +66,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.DISEASE, Tags.SOCIAL_MEDIA, Tags.SENTIMENT_ANALYSIS] +_TAGS = [Tags.DISEASE, Tags.SENTIMENT_ANALYSIS] _LANGUAGES = [Lang.EN, Lang.ZH, Lang.JA] _PUBMED = False _LOCAL = True From 1b0b89eae1c383cdfb56ab2bfe06fd4f1d5d1f2b Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:32:37 +0200 Subject: [PATCH 11/20] rm ORGANISM --- bigbio/utils/resources/tags.json | 1 - 1 file changed, 1 deletion(-) diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index 51ed9f1b9..46e624683 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -1,7 +1,6 @@ { "SOCIAL_MEDIA" : "Social media", "ANATOMY" : "Anatomy", - "ORGANISM" : 
"Organism", "ORGAN" : "Organ", "VARIANT" : "Variant/Mutation", "TISSUE" : "Tissue", From f5cc0525c39b0d76cfe766f2ac959548767891fa Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:53:17 +0200 Subject: [PATCH 12/20] add DIAGNOSIS tag --- bigbio/biodatasets/chia/chia.py | 2 +- bigbio/biodatasets/codiesp/codiesp.py | 2 +- bigbio/biodatasets/ctebmsp/ctebmsp.py | 2 +- bigbio/biodatasets/essai/essai.py | 2 +- bigbio/biodatasets/evidence_inference/evidence_inference.py | 2 +- bigbio/biodatasets/mantra_gsc/mantra_gsc.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bigbio/biodatasets/chia/chia.py b/bigbio/biodatasets/chia/chia.py index da93b98d4..b837bb916 100644 --- a/bigbio/biodatasets/chia/chia.py +++ b/bigbio/biodatasets/chia/chia.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.DISEASE, Tags.DRUG, Tags.PROCEDURE] +_TAGS = [Tags.DISEASE, Tags.DRUG, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/codiesp/codiesp.py b/bigbio/biodatasets/codiesp/codiesp.py index 65671fcd8..b9c551e0d 100644 --- a/bigbio/biodatasets/codiesp/codiesp.py +++ b/bigbio/biodatasets/codiesp/codiesp.py @@ -38,7 +38,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.DISEASE, Tags.DOCUMENT_INDEXING, Tags.PROCEDURE] +_TAGS = [Tags.DISEASE, Tags.DOCUMENT_INDEXING, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.ES] _PUBMED = False _LOCAL = False diff --git a/bigbio/biodatasets/ctebmsp/ctebmsp.py b/bigbio/biodatasets/ctebmsp/ctebmsp.py index f5a3fc2b8..0831f48b7 100644 --- a/bigbio/biodatasets/ctebmsp/ctebmsp.py +++ b/bigbio/biodatasets/ctebmsp/ctebmsp.py @@ -34,7 +34,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.ANATOMY, Tags.CHEMICAL, Tags.DISEASE, Tags.PROCEDURE] +_TAGS = 
[Tags.ANATOMY, Tags.CHEMICAL, Tags.DISEASE, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.ES] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/essai/essai.py b/bigbio/biodatasets/essai/essai.py index aab446381..4cff31a13 100644 --- a/bigbio/biodatasets/essai/essai.py +++ b/bigbio/biodatasets/essai/essai.py @@ -9,7 +9,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.DISEASE, Tags.PROCEDURE] +_TAGS = [Tags.NEGATION, Tags.SPECULATION, Tags.DISEASE, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.FR] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/evidence_inference/evidence_inference.py b/bigbio/biodatasets/evidence_inference/evidence_inference.py index d17594ca8..cade748f8 100644 --- a/bigbio/biodatasets/evidence_inference/evidence_inference.py +++ b/bigbio/biodatasets/evidence_inference/evidence_inference.py @@ -35,7 +35,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.PROCEDURE] +_TAGS = [Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py index cf572db03..8eb1891c1 100644 --- a/bigbio/biodatasets/mantra_gsc/mantra_gsc.py +++ b/bigbio/biodatasets/mantra_gsc/mantra_gsc.py @@ -25,7 +25,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.GENE, Tags.DISEASE, Tags.PROCEDURE] +_TAGS = [Tags.GENE, Tags.DISEASE, Tags.PROCEDURE, Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN, Lang.FR, Lang.DE, Lang.NL, Lang.ES] _PUBMED = True _LOCAL = False From f4b528e02486f06647640aba776f657523a325df Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:53:34 +0200 Subject: [PATCH 13/20] add n2c2 datasets --- bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py | 2 +- 
bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py | 2 +- bigbio/biodatasets/n2c2_2008/n2c2_2008.py | 2 +- bigbio/biodatasets/n2c2_2009/n2c2_2009.py | 2 +- bigbio/biodatasets/n2c2_2010/n2c2_2010.py | 2 +- bigbio/biodatasets/n2c2_2011/n2c2_2011.py | 2 +- bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py | 2 +- bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py b/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py index 9144f25f0..cde53908b 100644 --- a/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py +++ b/bigbio/biodatasets/n2c2_2006_deid/n2c2_2006_deid.py @@ -71,7 +71,7 @@ _DATASETNAME = "n2c2_2006" # https://academic.oup.com/jamia/article/14/5/550/720189 -_TAGS = [] +_TAGS = [Tags.ANONYMIZATION] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py b/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py index 6e0fc9209..9d0b1a99b 100644 --- a/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py +++ b/bigbio/biodatasets/n2c2_2006_smokers/n2c2_2006_smokers.py @@ -69,7 +69,7 @@ _DATASETNAME = "n2c2_2006" # https://academic.oup.com/jamia/article/15/1/14/779738 -_TAGS = [] +_TAGS = [Tags.DIAGNOSIS] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2008/n2c2_2008.py b/bigbio/biodatasets/n2c2_2008/n2c2_2008.py index 4b3054ac1..bb2f37e3b 100644 --- a/bigbio/biodatasets/n2c2_2008/n2c2_2008.py +++ b/bigbio/biodatasets/n2c2_2008/n2c2_2008.py @@ -77,7 +77,7 @@ _DATASETNAME = "n2c2_2008" # https://academic.oup.com/jamia/article/16/4/561/766997 -_TAGS = [] +_TAGS = [Tags.DIAGNOSIS, Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2009/n2c2_2009.py b/bigbio/biodatasets/n2c2_2009/n2c2_2009.py index 88f1e60c5..742ce0955 100644 --- 
a/bigbio/biodatasets/n2c2_2009/n2c2_2009.py +++ b/bigbio/biodatasets/n2c2_2009/n2c2_2009.py @@ -60,7 +60,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2010/n2c2_2010.py b/bigbio/biodatasets/n2c2_2010/n2c2_2010.py index 549ac121a..3b095e3f7 100644 --- a/bigbio/biodatasets/n2c2_2010/n2c2_2010.py +++ b/bigbio/biodatasets/n2c2_2010/n2c2_2010.py @@ -55,7 +55,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.DIAGNOSIS, Tags.NEGATION] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py index 67fc5e684..478fba48d 100644 --- a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py +++ b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py @@ -78,7 +78,7 @@ _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.TREATMENT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py b/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py index 59411a293..0c18374c7 100644 --- a/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py +++ b/bigbio/biodatasets/n2c2_2018_track1/n2c2_2018_track1.py @@ -46,7 +46,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py b/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py index 13ddc19b1..9862ea227 100644 --- a/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py +++ b/bigbio/biodatasets/n2c2_2018_track2/n2c2_2018_track2.py @@ -49,7 +49,7 @@ from 
bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DRUG, Tags.ADR] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True From fbfdc7063ce71db88be2e321f1b8366692456ff8 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:53:44 +0200 Subject: [PATCH 14/20] add diagnosis tag --- bigbio/biodatasets/quaero/quaero.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigbio/biodatasets/quaero/quaero.py b/bigbio/biodatasets/quaero/quaero.py index 2d949b028..4edc5c45c 100644 --- a/bigbio/biodatasets/quaero/quaero.py +++ b/bigbio/biodatasets/quaero/quaero.py @@ -16,6 +16,7 @@ Tags.SPECIES, Tags.PROCEDURE, Tags.DISEASE, + Tags.DIAGNOSIS, ] _LANGUAGES = [Lang.FR] _PUBMED = True From 7f96f08e3164ea6129f496da89226cf6c8004e8d Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:53:50 +0200 Subject: [PATCH 15/20] update tags --- bigbio/utils/resources/tags.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index 46e624683..f58f27794 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -44,5 +44,6 @@ "MIRNA" : "miRNA", "ABBREVIATION" : "Abbreviation", "FACT_CHECKING" : "Fact-checking", - "INTENT" : "Intent" + "INTENT" : "Intent", + "DIAGNOSIS" : "DIAGNOSIS" } From 4109ebfcb05d3e96e119ff33b3e0e7f4c37fd1e6 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:58:29 +0200 Subject: [PATCH 16/20] format --- bigbio/biodatasets/biosses/biosses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigbio/biodatasets/biosses/biosses.py b/bigbio/biodatasets/biosses/biosses.py index a55a313c2..7f7e72eec 100644 --- a/bigbio/biodatasets/biosses/biosses.py +++ b/bigbio/biodatasets/biosses/biosses.py @@ -33,7 +33,7 @@ _DATASETNAME = "biosses" -_TAGS = [] +_TAGS = [Tags.SENTENCE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False From 
a7728abf2df44dd8e18c50e9ee40fe3426665cc0 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 15:58:34 +0200 Subject: [PATCH 17/20] add tags to examples --- examples/bc5cdr.py | 2 +- examples/bioasq_task_b.py | 9 +++++---- examples/biosses.py | 2 +- examples/chemprot.py | 2 +- examples/hallmarks_of_cancer.py | 2 +- examples/mlee.py | 2 +- examples/muchmore.py | 2 +- examples/n2c2_2011.py | 2 +- examples/nlmchem.py | 2 +- 9 files changed, 13 insertions(+), 12 deletions(-) diff --git a/examples/bc5cdr.py b/examples/bc5cdr.py index ee325c162..0c94c3add 100644 --- a/examples/bc5cdr.py +++ b/examples/bc5cdr.py @@ -35,7 +35,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.CHEMICAL_DISEASE_RELATION, Tags.MESH] +_TAGS = [Tags.DISEASE, Tags.CHEMICAL] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/bioasq_task_b.py b/examples/bioasq_task_b.py index 9026918f6..8be34588c 100644 --- a/examples/bioasq_task_b.py +++ b/examples/bioasq_task_b.py @@ -36,10 +36,11 @@ from bigbio.utils.license import Licenses _TAGS = [ - Tags.QA_YESNO - Tags.QA_FACTOID, - Tags.QA_LIST, - Tags.QA_SUMMARY, + Tags.YESNO, + Tags.FACTOID, + Tags.FACTOID_LIST, + Tags.ABSTRACTIVE, + Tags.EXTRACTIVE, ] _LANGUAGES = [Lang.EN] _PUBMED = True diff --git a/examples/biosses.py b/examples/biosses.py index 80aa75b36..913239499 100644 --- a/examples/biosses.py +++ b/examples/biosses.py @@ -33,7 +33,7 @@ _DATASETNAME = "biosses" -_TAGS = [] +_TAGS = [Tags.SENTENCE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = False diff --git a/examples/chemprot.py b/examples/chemprot.py index c29b362ae..3a43c3197 100644 --- a/examples/chemprot.py +++ b/examples/chemprot.py @@ -28,7 +28,7 @@ from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.GENE] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False 
diff --git a/examples/hallmarks_of_cancer.py b/examples/hallmarks_of_cancer.py index 50600def9..09f377944 100644 --- a/examples/hallmarks_of_cancer.py +++ b/examples/hallmarks_of_cancer.py @@ -22,7 +22,7 @@ from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.CANCER] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/mlee.py b/examples/mlee.py index e0330d53a..b98bf327c 100644 --- a/examples/mlee.py +++ b/examples/mlee.py @@ -32,7 +32,7 @@ _SOURCE_VIEW_NAME = "source" _UNIFIED_VIEW_NAME = "bigbio" -_TAGS = [] +_TAGS = [Tags.GENE, Tags.DRUG, Tags.CELL, Tags.ORGAN, Tags.TISSUE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/muchmore.py b/examples/muchmore.py index 9afb2982d..6ce74b9f8 100644 --- a/examples/muchmore.py +++ b/examples/muchmore.py @@ -76,7 +76,7 @@ from bigbio.utils.constants import Lang, Tasks, Tags from bigbio.utils.license import Licenses -_TAGS = [] +_TAGS = [Tags.POS] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/examples/n2c2_2011.py b/examples/n2c2_2011.py index d1dd79f7f..2495432fc 100644 --- a/examples/n2c2_2011.py +++ b/examples/n2c2_2011.py @@ -78,7 +78,7 @@ _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 -_TAGS = [] +_TAGS = [Tags.DISEASE, Tags.TREATMENT] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/examples/nlmchem.py b/examples/nlmchem.py index 885234462..6b9438592 100644 --- a/examples/nlmchem.py +++ b/examples/nlmchem.py @@ -26,7 +26,7 @@ from bigbio.utils.license import Licenses from bigbio.utils.parsing import get_texts_and_offsets_from_bioc_ann -_TAGS = [] +_TAGS = [Tags.CHEMICAL, Tags.DOCUMENT_INDEXING] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False From dda930f9b0cdd73e4bf70f93b129a604a52db316 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 18:05:37 +0200 Subject: [PATCH 18/20] fix 
missing/errors --- bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py | 4 ++-- bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py | 8 +++++++- bigbio/biodatasets/lll/lll.py | 2 +- bigbio/biodatasets/medmentions/medmentions.py | 2 +- bigbio/biodatasets/n2c2_2011/n2c2_2011.py | 2 +- bigbio/biodatasets/nagel/nagel.py | 2 +- bigbio/biodatasets/pho_ner/pho_ner.py | 2 +- bigbio/utils/resources/tags.json | 2 +- 8 files changed, 15 insertions(+), 9 deletions(-) diff --git a/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py b/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py index a72d0386d..6365cd7e2 100644 --- a/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py +++ b/bigbio/biodatasets/bionlp_st_2013_cg/bionlp_st_2013_cg.py @@ -30,11 +30,11 @@ Tags.DISEASE, Tags.CANCER, Tags.TISSUE, - Tags.ORGANISM, + Tags.SPECIES, Tags.CELL, Tags.GENE, Tags.CHEMICAL, - Tags.PATHWAY, + Tags.PATHWAY_CURATION, ] _LANGUAGES = [Lang.EN] _PUBMED = True diff --git a/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py b/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py index f685ff3ea..cee27dfac 100644 --- a/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py +++ b/bigbio/biodatasets/bionlp_st_2013_pc/bionlp_st_2013_pc.py @@ -26,7 +26,13 @@ _DATASETNAME = "bionlp_st_2013_pc" _UNIFIED_VIEW_NAME = "bigbio" -_TAGS = [Tags.GENE, Tags.CHEMICAL, Tags.PATHWAY, Tags.NEGATION, Tags.SPECULATION] +_TAGS = [ + Tags.GENE, + Tags.CHEMICAL, + Tags.PATHWAY_CURATION, + Tags.NEGATION, + Tags.SPECULATION, +] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/lll/lll.py b/bigbio/biodatasets/lll/lll.py index 560185a5f..6dfe9914e 100644 --- a/bigbio/biodatasets/lll/lll.py +++ b/bigbio/biodatasets/lll/lll.py @@ -36,7 +36,7 @@ from bigbio.utils import schemas from bigbio.utils.configs import BigBioConfig -from bigbio.utils.constants import BigBioValues, Lang, Tasks +from bigbio.utils.constants import BigBioValues, Lang, Tasks, 
Tags from bigbio.utils.license import Licenses _TAGS = [Tags.GENE] diff --git a/bigbio/biodatasets/medmentions/medmentions.py b/bigbio/biodatasets/medmentions/medmentions.py index 9c9746635..633b86dd8 100644 --- a/bigbio/biodatasets/medmentions/medmentions.py +++ b/bigbio/biodatasets/medmentions/medmentions.py @@ -46,7 +46,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import Licenses -_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.ORGANISM] +_TAGS = [Tags.DISEASE, Tags.CHEMICAL, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py index 478fba48d..7ab93a594 100644 --- a/bigbio/biodatasets/n2c2_2011/n2c2_2011.py +++ b/bigbio/biodatasets/n2c2_2011/n2c2_2011.py @@ -78,7 +78,7 @@ _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 -_TAGS = [Tags.DISEASE, Tags.TREATMENT] +_TAGS = [Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True diff --git a/bigbio/biodatasets/nagel/nagel.py b/bigbio/biodatasets/nagel/nagel.py index 0f5990ff7..c80f80dbf 100644 --- a/bigbio/biodatasets/nagel/nagel.py +++ b/bigbio/biodatasets/nagel/nagel.py @@ -26,7 +26,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [Tags.MUTATION, Tags.GENE, Tags.SPECIES] +_TAGS = [Tags.VARIANT, Tags.GENE, Tags.SPECIES] _LANGUAGES = [Lang.EN] _PUBMED = True _LOCAL = False diff --git a/bigbio/biodatasets/pho_ner/pho_ner.py b/bigbio/biodatasets/pho_ner/pho_ner.py index 32e0e4e02..821cd0d01 100644 --- a/bigbio/biodatasets/pho_ner/pho_ner.py +++ b/bigbio/biodatasets/pho_ner/pho_ner.py @@ -23,7 +23,7 @@ from bigbio.utils.constants import Lang, Tags, Tasks from bigbio.utils.license import CustomLicense -_TAGS = [Tag.DISEASE, Tag.COVID] +_TAGS = [Tags.DISEASE, Tags.COVID] _LANGUAGES = [Lang.VI] _PUBMED = False _LOCAL = False diff --git 
a/bigbio/utils/resources/tags.json b/bigbio/utils/resources/tags.json index f58f27794..dc3f93eb4 100644 --- a/bigbio/utils/resources/tags.json +++ b/bigbio/utils/resources/tags.json @@ -16,7 +16,7 @@ "HOW" : "`How` question", "WHY" : "`Why` question", "FACTOID" : "QA with factoid answer", - "FACTOIND_LIST": "QA with list of factoid answer", + "FACTOID_LIST": "QA with list of factoid answer", "ABSTRACTIVE" : "Abstractive summary/answer", "EXTRACTIVE" : "Extractive summary/answer", "CLOZE_TEST" : "Cloze test", From 3493d0f5a19cff5f4174a39a6590b09eb896ae58 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 18:05:49 +0200 Subject: [PATCH 19/20] treatment is procedure --- examples/n2c2_2011.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/n2c2_2011.py b/examples/n2c2_2011.py index 2495432fc..ca6f47e10 100644 --- a/examples/n2c2_2011.py +++ b/examples/n2c2_2011.py @@ -78,7 +78,7 @@ _DATASETNAME = "n2c2_2011" # https://academic.oup.com/jamia/article/19/5/786/716138 -_TAGS = [Tags.DISEASE, Tags.TREATMENT] +_TAGS = [Tags.DISEASE, Tags.PROCEDURE] _LANGUAGES = [Lang.EN] _PUBMED = False _LOCAL = True From 44bce0ced4836f5ce6b848aba02f04c9e48ae465 Mon Sep 17 00:00:00 2001 From: "sgarda.wbi" Date: Wed, 8 Jun 2022 18:06:06 +0200 Subject: [PATCH 20/20] add script to gather (fine-grained) tasks counts --- scripts/gather_dataset_tasks.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 scripts/gather_dataset_tasks.py diff --git a/scripts/gather_dataset_tasks.py b/scripts/gather_dataset_tasks.py new file mode 100644 index 000000000..7523e8f45 --- /dev/null +++ b/scripts/gather_dataset_tasks.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Generate counts of tasks and fine-grained tasks +""" + +from bigbio.dataloader import BigBioConfigHelpers + + +def main(): + """ + Gather counts on tasks and fine-grained tasks + """ + + configs = BigBioConfigHelpers() + + dataset_task = set() + +
for conf in configs: + for task in conf.tasks: + dataset_task.add((conf.dataset_name, str(task))) + + print(dataset_task) + + +if __name__ == "__main__": + main()