Skip to content
Snippets Groups Projects
Commit 343a3dc4 authored by Jonas Röger's avatar Jonas Röger
Browse files

Facts4Chat v1.0.0

parents
Branches main
No related tags found
No related merge requests found
Pipeline #256031 passed
.vscode/*
\ No newline at end of file
# Pipeline stages, listed in execution order.
stages:
  - openapi
  - wheel
# Generate the Python client sources from openapi/database.yaml with the stock
# openapi-generator-cli image and publish out/python/ as job artifacts.
# NOTE(review): CI_COMMIT_TAG is only set in tag pipelines and this job has no
# tag restriction, so packageVersion is presumably empty elsewhere - confirm.
# @NoArtifactsToSource
openapi:
  stage: openapi
  image:
    name: openapitools/openapi-generator-cli:v7.0.1
  script:
    - docker-entrypoint.sh generate -i openapi/database.yaml -g python -o out/python -c openapi/openapi-generator-config.yaml -p packageVersion=${CI_COMMIT_TAG}
  artifacts:
    paths:
      - out/python/
# Package the generated client found in out/python with poetry and upload the
# build output to this project's GitLab PyPI package registry (tags only).
wheel:
  stage: wheel
  image:
    name: python:3.11.0
  only:
    - tags
  cache:
    paths:
      - .cache/pip
  variables:
    # Point pip at the cached directory declared above.
    PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
  before_script:
    - pip install poetry twine
  script:
    - cd out/python
    - poetry config virtualenvs.in-project true
    - poetry env use /usr/local/bin/python3.11
    - poetry build
    # Folded scalar: the lines below are joined with single spaces into one
    # shell command. The upload authenticates with the CI job token.
    - >-
      TWINE_PASSWORD=${CI_JOB_TOKEN} TWINE_USERNAME=gitlab-ci-token
      python -m twine upload --verbose
      --repository-url ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/pypi
      dist/*
# Python 3.11 base image (Debian bullseye).
FROM python:3.11-bullseye

# Install a headless Java 11 runtime (presumably needed to run the generator
# jar fetched below) and wget. apt-get is used instead of apt because apt's
# command-line interface is not stable for scripted use; the package lists are
# removed in the same layer so they do not end up in the image.
RUN apt-get update -y && apt-get install -y \
        openjdk-11-jre-headless \
        wget \
    && apt-get autoremove -y && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3.11 install --no-cache-dir poetry

# Fetch openapi-generator-cli 7.0.1 from Maven Central.
RUN wget https://repo1.maven.org/maven2/org/openapitools/openapi-generator-cli/7.0.1/openapi-generator-cli-7.0.1.jar \
        -O /opt/openapi-generator-cli.jar
# NOTE(review): presumably the tag of the Python image built from the
# Dockerfile - confirm. Quoted so the version is always read as a string.
variables:
  python_image_version: '1.1.0'
# Hooks based on
# - https://www.architecture-performance.fr/ap_blog/some-pre-commit-git-hooks-for-python/
# - https://github.com/pre-commit/pre-commit-hooks
#
# Hook versions are pinned through `rev`.
# NOTE(review): the earlier "latest as of 2021-10-20" remark looks stale for
# the revs pinned below - confirm and refresh the date when bumping versions.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: check-merge-conflict  # Check for files that contain merge conflict strings
      - id: mixed-line-ending  # Normalize all line endings to LF (see --fix=lf below)
        args:
          - "--fix=lf"
      - id: no-commit-to-branch  # Prevent direct commits to protected branches (hook default: master and main)
      - id: trailing-whitespace  # Trims trailing whitespace
      - id: end-of-file-fixer  # Makes sure files end in a newline and only a newline
  - repo: https://github.com/compilerla/conventional-pre-commit  # enforce conventional commit messages
    rev: v2.4.0
    hooks:
      - id: conventional-pre-commit
        stages: [commit-msg]
# Database API
The file [`openapi/database.yaml`](./openapi/database.yaml) contains an OpenAPI specification
for the Database request/response schema discussed on [24.10.2023](https://gitlab.fachschaften.org/PG-Facts4Chat/sitzungsprotokolle/-/blob/c89834286fce94f03a73b443d681d52949bed9aa/Protokolle/20231024.md).
## Python-Client (openapi-generator)
A `whl` file can be found in the latest [artifacts](https://gitlab.fachschaften.org/PG-Facts4Chat/datacollection/database-api/-/artifacts).
# Document header: OpenAPI version followed by the API metadata.
openapi: "3.0.2"
info:
  title: Database
  description: The indexing and retrieval API of the database.
  version: "0.7.0.dev0"
paths:
# Retrieval endpoint: takes a RetrievalRequest, answers with a RetrievalResponse.
/retrieval:
  post:
    summary: Retrieve chunks for a query
    requestBody:
      # OpenAPI treats request bodies as optional unless stated otherwise;
      # this operation has nothing to work on without one.
      required: true
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/RetrievalRequest"
    responses:
      '200':
        description: 'OK'
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/RetrievalResponse"
      '415':
        description: "The content type is not application/json"
      '422':
        description: "The content does not validate against the API-Specification."
# Discovery endpoint: responds with a JSON array of strings.
/info/retrievers:
  get:
    summary: Get a list of available retrievers
    responses:
      "200":
        description: A list of retrievers
        content:
          application/json:
            schema:
              type: array
              items:
                type: string
# Discovery endpoint: responds with a JSON array of strings.
/info/rerankers:
  get:
    summary: Get a list of available rerankers
    responses:
      "200":
        description: A list of rerankers
        content:
          application/json:
            schema:
              type: array
              items:
                type: string
# Discovery endpoint: responds with a JSON array of strings.
/info/embedders:
  get:
    summary: Get a list of available embedders
    responses:
      "200":
        description: A list of embedders
        content:
          application/json:
            schema:
              type: array
              items:
                type: string
# Lists the indices of one retriever, selected by the `retriever_name`
# path parameter.
/info/retrievers/{retriever_name}/indices:
  get:
    summary: Get a list of indices for a specific retriever
    parameters:
      - name: retriever_name
        in: path
        required: true
        schema:
          type: string
    responses:
      "200":
        description: A list of indices
        content:
          application/json:
            schema:
              type: array
              items:
                type: string
      "404":
        description: Retriever not found
      "500":
        description: Internal server error
components:
schemas:
# Loosely typed retriever: only `name` is mandatory and `args` is a free-form
# object.
GenericRetriever:
  type: object
  description: Backwards compatible retriever for development only.
  required: [name]
  properties:
    name:
      type: string
      # Hack for pydantic validation, enforces correct name
      enum:
        - Generic
    args:
      type: object
# Term based (BM25) OpenSearch retriever. `name` is pinned to a single enum
# value and `args` must list at least one index.
OpenSearchTermRetriever:
  type: object
  description: The term based OpenSearch retriever (BM25) type specification.
  required:
    - name
    - args
  properties:
    name:
      type: string
      enum: [OpenSearchTerm]  # Hack for pydantic validation, enforces correct name
    args:
      type: object
      description: Arguments for the term based OpenSearch retriever.
      required:
        - indices
      properties:
        indices:
          type: array
          minItems: 1
          items:
            type: string
          description: A list of indices to search in.
        analyzer:
          description: An OpenSearch built-in analyzer. English performs stemming and stop-word removal for english sentences.
          type: string
          default: english
        operator:
          type: string
          description: How to aggregate the found documents for each term. (intersect=and, union=or)
          default: or
          enum:
            - and
            - or
        fuzziness:
          description: Maximum allowed Levenshtein-distance for a word to count as a match.
          default: AUTO
          # Either the literal string AUTO or a non-negative integer.
          oneOf:
            - type: string
              enum: [AUTO]
            - type: integer
              minimum: 0
        prefix_length:
          type: integer
          description: Minimum number of characters to exactly match at the beginning of words.
          minimum: 0
        maximum_results:
          type: integer
          description: Maximum number of results to retrieve from the database.
          default: 50
          minimum: 1
        minimum_should_match:
          type: integer
          minimum: 1
          description: The minimum number of terms, that should match for each candidate chunk.
# ChromaDB retriever. `name` is pinned to a single enum value; `args` must
# name at least one index and the embedder to use.
ChromaDBRetriever:
  type: object
  description: The chroma db retriever arguments.
  required:
    - name
    - args
  properties:
    name:
      type: string
      enum: [ChromaDB]  # Hack for pydantic validation, enforces correct name
    args:
      type: object
      description: Arguments for the chroma db retriever.
      required:
        - indices
        - embedder
      properties:
        indices:
          type: array
          minItems: 1
          items:
            type: string
          description: A list of indices to search in. (Internally the database-server will postfix the names with the embedder name)
        embedder:
          type: string
          description: The embedder to use. (This requires prior indexing with this specific embedder)
        maximum_results:
          type: integer
          description: Maximum number of results to retrieve from the database.
          default: 50
          minimum: 1
# Union of all retriever variants, told apart by the value of `name`.
Retriever:
  type: object
  description: The retriever type specification.
  discriminator:
    propertyName: name
    # Every variant listed under anyOf needs an entry here: the `name` values
    # (e.g. ChromaDB) differ from the schema names (e.g. ChromaDBRetriever),
    # so the implicit schema-name mapping would not match them.
    mapping:
      OpenSearchTerm: "#/components/schemas/OpenSearchTermRetriever"
      ChromaDB: "#/components/schemas/ChromaDBRetriever"
      Generic: "#/components/schemas/GenericRetriever"
  anyOf:
    - $ref: "#/components/schemas/OpenSearchTermRetriever"
    - $ref: "#/components/schemas/ChromaDBRetriever"
    - $ref: "#/components/schemas/GenericRetriever"
# Request body of POST /retrieval. Only `query` and `retrieverType` are
# mandatory; the remaining properties are optional.
RetrievalRequest:
  type: object
  description: The retrieval request body
  required:
    - query
    - retrieverType
  properties:
    query:
      type: string
      description: The retrieval query.
    retrieverType:
      $ref: "#/components/schemas/Retriever"
    rerank:
      type: string
      description: An optional reranking strategy to run after retrieval.
    maxLength:
      type: integer
      description: An optional (estimated) maximum length of the full response.
      minimum: 0
    neighborsBefore:
      type: integer
      description: Specify how many neighboring chunks should be returned before the found chunks.
      minimum: 0
    neighborsAfter:
      type: integer
      description: Specify how many neighboring chunks should be returned after the found chunks.
      minimum: 0
# Free-form object: no properties are declared for chunk metadata.
ChunkMeta:
  description: Metadata of the stored chunks.
  type: object
# One stored text chunk; `meta` is the only optional property.
Chunk:
  type: object
  description: A representation of a stored chunk of text.
  required: [content, index, document_uid]
  properties:
    content:
      type: string
    index:
      type: integer
      minimum: 0
    document_uid:
      type: string
    meta:
      $ref: "#/components/schemas/ChunkMeta"
# A scored hit; the neighbouring chunk lists are optional.
RetrievalResult:
  type: object
  description: A single retrieval result.
  required: [chunk, score]
  properties:
    chunk:
      $ref: "#/components/schemas/Chunk"
    score:
      type: number
      description: A similarity score between the query and this result, calculated by the retrieval/reranking method.
    beforeChunk:
      description: The chunks before the retrieved chunk.
      type: array
      items:
        $ref: "#/components/schemas/Chunk"
    afterChunk:
      description: The chunks after the retrieved chunk.
      type: array
      items:
        $ref: "#/components/schemas/Chunk"
# Response body of POST /retrieval: the query together with its results.
RetrievalResponse:
  type: object
  description: The retrieval response body
  required:
    - query
    - results
  properties:
    query:
      type: string
      description: The query of the retrieval request.
    results:
      type: array
      description: All retrieved results for the query.
      items:
        $ref: "#/components/schemas/RetrievalResult"
# openapi-generator options: names of the generated Python package and project.
packageName: 'database_client'
projectName: 'database-client'
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment