arrow-left

All pages
gitbookPowered by GitBook
1 of 4

Loading...

Loading...

Loading...

Loading...

Database Schema

hashtag
api_key

Column
Type
Nullable
Default

id

hashtag
Constraints and indexes

  • api_key_pkey PRIMARY KEY, btree (id)

  • api_key_user_id_key UNIQUE CONSTRAINT, btree (user_id)


hashtag
batch_search

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • batch_search_pkey PRIMARY KEY, btree (uuid)

  • batch_search_date btree (batch_date)

  • batch_search_nb_queries btree (nb_queries)

hashtag
Referenced by

  • batch_search_pkey PRIMARY KEY, btree (uuid)

  • batch_search_date btree (batch_date)

  • batch_search_nb_queries btree (nb_queries)


hashtag
batch_search_project

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • batch_search_project_unique UNIQUE, btree (search_uuid, prj_id)

  • batch_search_project_batch_search_uuid_fk FOREIGN KEY (search_uuid) REFERENCES batch_search(uuid)


hashtag
batch_search_query

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • batch_search_query_search_id btree (search_uuid)

  • idx_query_result_batch_unique UNIQUE, btree (search_uuid, query)


hashtag
batch_search_result

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • batch_search_result_prj_id btree (prj_id)

  • batch_search_result_query btree (query)

  • batch_search_result_uuid btree (search_uuid)


hashtag
document

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • document_pkey PRIMARY KEY, btree (id)

  • document_parent_id btree (parent_id)

  • document_status btree (status)


hashtag
document_tag

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • document_tag_doc_id btree (doc_id)

  • document_tag_label btree (label)

  • document_tag_project_id btree (prj_id)


hashtag
document_user_recommendation

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • document_user_mark_read_doc_id btree (doc_id)

  • document_user_mark_read_project_id btree (prj_id)

  • document_user_mark_read_user_id btree (user_id)


hashtag
document_user_star

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • document_user_star_doc_id btree (doc_id)

  • document_user_star_project_id btree (prj_id)

  • document_user_star_user_id btree (user_id)


hashtag
named_entity

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • named_entity_pkey PRIMARY KEY, btree (id)

  • named_entity_doc_id btree (doc_id)


hashtag
note

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • idx_unique_note_path_project UNIQUE, btree (project_id, path)

  • note_project btree (project_id)


hashtag
project

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • project_pkey PRIMARY KEY, btree (id)


hashtag
task

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • task_pkey PRIMARY KEY, btree (id)

  • task_created_at btree (created_at)

  • task_group btree (group_id)


hashtag
user_history

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • user_history_pkey PRIMARY KEY, btree (id)

  • idx_user_history_unique UNIQUE, btree (user_id, uri)

  • user_history_creation_date btree (creation_date)

hashtag
Referenced by

  • user_history_pkey PRIMARY KEY, btree (id)

  • idx_user_history_unique UNIQUE, btree (user_id, uri)

  • user_history_creation_date btree (creation_date)


hashtag
user_history_project

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • user_history_project_unique UNIQUE, btree (user_history_id, prj_id)

  • user_history_project_user_history_id_fk FOREIGN KEY (user_history_id) REFERENCES user_history(id)


hashtag
user_inventory

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • user_inventory_pkey PRIMARY KEY, btree (id)


hashtag
user_policy

Column
Type
Nullable
Default

hashtag
Constraints and indexes

  • idx_user_policy_unique UNIQUE, btree (user_id, prj_id)


user_id

character varying(96)

not null

batch_date

timestamp without time zone

not null

state

character varying(8)

not null

published

integer

not null

0

phrase_matches

integer

not null

0

fuzziness

integer

not null

0

file_types

text

paths

text

error_message

text

batch_results

integer

0

error_query

text

query_template

text

nb_queries

integer

0

uri

text

nb_queries_without_results

integer

  • batch_search_published btree (published)

  • batch_search_user_id btree (user_id)

  • batch_search_published btree (published)

  • batch_search_user_id btree (user_id)

  • Referenced by:

  • TABLE batch_search_project CONSTRAINT batch_search_project_batch_search_uuid_fk FOREIGN KEY (search_uuid) REFERENCES batch_search(uuid)

  • query_results

    integer

    0

    doc_id

    character varying(96)

    not null

    root_id

    character varying(96)

    not null

    doc_path

    character varying(4096)

    not null

    creation_date

    timestamp without time zone

    content_type

    character varying(255)

    content_length

    bigint

    prj_id

    character varying(96)

    content

    text

    metadata

    text

    status

    smallint

    extraction_level

    smallint

    language

    character(2)

    extraction_date

    timestamp without time zone

    parent_id

    character varying(96)

    root_id

    character varying(96)

    content_type

    character varying(256)

    content_length

    bigint

    charset

    character varying(32)

    ner_mask

    smallint

    user_id

    character varying(255)

    creation_date

    timestamp without time zone

    not null

    '1970-01-01 00:00:00'::timestamp without time zone

    idx_document_tag_unique UNIQUE, btree (doc_id, label)

    creation_date

    timestamp without time zone

    now()

  • idx_document_mark_read_unique UNIQUE, btree (doc_id, user_id, prj_id)

  • idx_document_star_unique UNIQUE, btree (doc_id, user_id, prj_id)

  • extractor

    smallint

    not null

    category

    character varying(8)

    doc_id

    character varying(96)

    not null

    root_id

    character varying(96)

    extractor_language

    character(2)

    hidden

    boolean

    variant

    character varying(16)

    blur_sensitive_media

    boolean

    not null

    false

    label

    character varying(255)

    publisher_name

    character varying(255)

    ''::character varying

    maintainer_name

    character varying(255)

    ''::character varying

    source_url

    character varying(2048)

    ''::character varying

    logo_url

    character varying(2048)

    ''::character varying

    creation_date

    timestamp without time zone

    now()

    update_date

    timestamp without time zone

    now()

    description

    character varying(4096)

    ''::character varying

    user_id

    character varying(96)

    group_id

    character varying(128)

    progress

    double precision

    0

    created_at

    timestamp without time zone

    not null

    completed_at

    timestamp without time zone

    retries_left

    integer

    max_retries

    integer

    args

    text

    result

    text

    error

    text

    task_name btree (name)

  • task_state btree (state)

  • task_user_id btree (user_id)

  • user_id

    character varying(96)

    not null

    type

    smallint

    not null

    name

    text

    uri

    text

    not null

  • user_history_type btree (type)

  • user_history_user_id btree (user_id)

  • user_history_type btree (type)

  • user_history_user_id btree (user_id)

  • Referenced by:

  • TABLE user_history_project CONSTRAINT user_history_project_user_history_id_fk FOREIGN KEY (user_history_id) REFERENCES user_history(id)

  • provider

    character varying(255)

    details

    text

    '{}'::text

    write

    boolean

    not null

    admin

    boolean

    not null

    character varying(96)

    not null

    user_id

    character varying(96)

    not null

    creation_date

    timestamp without time zone

    not null

    uuid

    character(36)

    not null

    name

    character varying(255)

    description

    character varying(4096)

    search_uuid

    character(36)

    not null

    prj_id

    character varying(96)

    not null

    search_uuid

    character(36)

    not null

    query_number

    integer

    not null

    query

    text

    not null

    search_uuid

    character(36)

    not null

    query

    text

    not null

    doc_nb

    integer

    not null

    id

    character varying(96)

    not null

    path

    character varying(4096)

    not null

    project_id

    character varying(96)

    not null

    doc_id

    character varying(96)

    not null

    label

    character varying(64)

    not null

    prj_id

    character varying(96)

    doc_id

    character varying(96)

    not null

    user_id

    character varying(96)

    not null

    prj_id

    character varying(96)

    doc_id

    character varying(96)

    not null

    user_id

    character varying(96)

    not null

    prj_id

    character varying(96)

    id

    character varying(96)

    not null

    mention

    text

    not null

    offsets

    text

    not null

    project_id

    character varying(96)

    not null

    path

    character varying(4096)

    note

    text

    id

    character varying(255)

    not null

    path

    character varying(4096)

    allow_from_mask

    character varying(64)

    id

    character varying(96)

    not null

    name

    character varying(128)

    not null

    state

    character varying(16)

    not null

    id

    integer

    not null

    generated by default as identity

    creation_date

    timestamp without time zone

    not null

    modification_date

    timestamp without time zone

    not null

    user_history_id

    integer

    not null

    prj_id

    character varying(96)

    not null

    id

    character varying(96)

    not null

    email

    text

    name

    character varying(255)

    user_id

    character varying(96)

    not null

    prj_id

    character varying(96)

    not null

    read

    boolean

    not null

    Backend

    Write extensions

    What if you want to add features to Datashare backend?

    Unlike plugins, which provide a way to modify the Datashare frontend, extensions have been created to extend the backend functionalities. Two extension points have been defined:

    • NLP pipelines : you can add a new java NLP pipeline to Datashare

    • HTTP API : you can add HTTP endpoints to Datashare and call the Java API you need in those endpoints

    API

    The Datashare API is fully defined using the OpenAPI 3.0 specification and automatically generated after every Datashare release.

    The OpenAPI spec is a language-agnostic, machine-readable document that describes all of the API’s endpoints, parameter and response schemas, security schemes, and metadata. It empowers developers to discover available operations, validate requests and responses, generate client libraries, and power interactive documentation tools.

    You can download the latest version of the API definition in JSON or explore an instantly browsable, developer-friendly interface with Redoc.

    Since version 7.5.0, instead of modifying Datashare directly, you can isolate your code with a specific set of features and then configure Datashare to use it. Each Datashare user can pick the extensions they need or want, and have a fully customized installation of our search platform.

    hashtag
    Getting started

    When starting, Datashare can receive an extensionsDir option, pointing to your extensions' directory. In this example, let's call it /home/user/extensions:

    hashtag
    Installing and Removing registered extensions

    hashtag
    Listing

    You can list official Datashare extensions like this :

    You can add a regular expression to --extensionList to filter the extension list if you know what you are looking for.

    hashtag
    Installing

    You can install an extension by giving its id and the directory where the Datashare extensions are stored:

    Then if you launch Datashare with the same extension location, the extension will be loaded.

    hashtag
    Removing

    When you want to stop using an extension, you can either remove the jar by hand from the extensions folder or remove it with datashare --extensionDelete:

    hashtag
    Create your first extension

    hashtag
    NLP extension

    You can create a "simple" Java project like https://github.com/ICIJ/datashare-extension-nlp-opennlp (as simple as a Java project can be, right?), with your preferred build tool.

    You will have to add a dependency on the latest version of datashare-api.jar to be able to implement your NLP pipeline.

    With the datashare API dependency you can then create a class implementing Pipeline or extending AbstractPipeline. When Datashare loads the jar, it will look for a Pipeline interface.

    Unfortunately, you will also have to make a pull request to datashare-api to add a new type of pipeline. We will remove this step in the future.

    Build the jar with its dependencies and install it in /home/user/extensions, then start Datashare with extensionsDir set to /home/user/extensions. Your extension will be loaded by Datashare.

    Finally, your pipeline will be listed in the available pipelines in the UI, when doing NER.

    hashtag
    HTTP extension

    Making an HTTP extension works the same way as for NLP: you'll have to make a Java project that builds a jar. The only dependency that you will need is fluent-http, because Datashare will look for fluent-http annotations such as @Get, @Post, @Put...

    For example, we can create a small class like :

    Build the jar, copy it to the /home/user/extensions then start datashare:

    et voilà 🔮 ! You can query your new endpoint. Easy, right?

    hashtag
    Installing and Removing your custom Extension

    You can also install and remove extensions with the Datashare CLI.

    Then you can install it with:

    And remove it:

    plugins
    mkdir /home/user/extensions
    datashare --extensionsDir=/home/user/extensions
    $ datashare -m CLI --extensionList
    2020-08-29 09:27:51,219 [main] INFO  Main - Running datashare 
    extension datashare-extension-nlp-opennlp
            OPENNLP Pipeline
            7.0.0
            https://github.com/ICIJ/datashare-extension-nlp-opennlp/releases/download/7.0.0/datashare-nlp-opennlp-7.0.0-jar-with-dependencies.jar
            Extension to extract NER entities with OPENNLP
            NLP
    ...
    $ datashare -m CLI --extensionInstall datashare-extension-nlp-mitie --extensionsDir "/home/user/extensions"
    2020-08-29 09:34:30,927 [main] INFO  Main - Running datashare 
    2020-08-29 09:34:32,632 [main] INFO  Extension - downloading from url https://github.com/ICIJ/datashare-extension-nlp-mitie/releases/download/7.0.0/datashare-nlp-mitie-7.0.0-jar-with-dependencies.jar
    2020-08-29 09:34:36,324 [main] INFO  Extension - installing extension from file /tmp/tmp218535941624710718.jar into /home/user/extensions
    $ datashare -m CLI --extensionDelete datashare-extension-nlp-mitie --extensionsDir "/home/user/extensions/"
    2020-08-29 09:40:11,033 [main] INFO  Main - Running datashare 
    2020-08-29 09:40:11,249 [main] INFO  Extension - removing extension datashare-extension-nlp-mitie jar /home/user/extensions/datashare-nlp-mitie-7.0.0-jar-with-dependencies.jar
    package org.myorg;
    
    import net.codestory.http.annotations.Get;
    import net.codestory.http.annotations.Prefix;
    
    /**
     * Minimal example of an HTTP extension resource.
     *
     * The class is mapped under the "myorg" prefix and exposes a single GET
     * route, "foo", so the endpoint is reachable at /myorg/foo once the
     * extension jar has been loaded.
     */
    @Prefix("myorg")
    public class FooResource {
        /** Plain-text body returned by the "foo" route. */
        private static final String GREETING = "hello from foo extension";

        /**
         * Handles GET requests on the "foo" route.
         *
         * @return the fixed greeting string
         */
        @Get("foo")
        public String getFoo() {
            return GREETING;
        }
    }
    $ datashare --extensionsDir /home/user/extensions/
    # ... starting logs
    2020-08-29 11:03:59,776 [Thread-0] INFO  ExtensionLoader - loading jar /home/user/extensions/my-extension.jar
    2020-08-29 11:03:59,779 [Thread-0] INFO  CorsFilter - adding Cross-Origin Request filter allows *
    2020-08-29 11:04:00,314 [Thread-0] INFO  Fluent - Production mode
    2020-08-29 11:04:00,331 [Thread-0] INFO  Fluent - Server started on port 8080
    $ curl localhost:8080/myorg/foo
    hello from foo extension
    $ datashare -m CLI --extensionInstall /home/user/src/my-extension/dist/my-extension.jar --extensionsDir "/home/user/extensions"
    2020-07-27 10:02:32,381 [main] INFO  Main - Running datashare 
    2020-07-27 10:02:32,596 [main] INFO  ExtensionService - installing extension from file /home/user/src/my-extension/dist/my-extension.jar into /home/user/extensions
    $ datashare -m CLI --extensionDelete my-extension.jar --extensionsDir "/home/user/extensions"
    2020-08-29 10:45:37,363 [main] INFO  Main - Running datashare 
    2020-08-29 10:45:37,579 [main] INFO  Extension - removing extension my-extension jar /home/user/extensions/my-extension.jar