PolarSPARC

Hands-On with Milvus Vector Database


Bhaskar S 10/13/2023


Overview

The world we live in today generates a variety of UNSTRUCTURED data objects, such as Audio clips, Images, Social media posts, Video clips, etc. These data objects have no defined standard features and hence are hard to store and retrieve in a consistent manner for future data analysis.

For example, how does one compare a set of images and determine that images one and five are those of a Tiger? Or, how does one determine that a set of social media posts are related to Covid?

One of the commonly used patterns to handle these unstructured data objects is to persist them, along with the appropriate metadata key-value tags, in big data stores, like Hadoop or S3, and process them using frameworks like Apache Spark. This approach is neither reliable nor scalable, as it involves manual tagging by humans.

With the rapid progress being made in the Deep Learning space, there is a need for a different approach to the storage and retrieval of the unstructured data objects.

This is where the exciting world of Vector Database(s) comes into play.

In the article Deep Learning - Word Embeddings with Word2Vec, we explained and demonstrated the approach to representing words as Vectors in an n-dimensional space. Similar techniques exist that can be applied to other unstructured data objects, such as Audio, Images, Video, etc.

In other words, a Vector Database persists unstructured data objects as their equivalent vector embeddings and indexes them for efficient and fast similarity searches (Document Similarity), in addition to supporting the traditional queries.
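
To make the idea concrete, the following is a minimal sketch (using numpy and made-up 4-dimensional embeddings, purely for illustration) of how similarity between two data objects boils down to a distance computation between their vectors:

import numpy as np

# Hypothetical 4-dimensional embeddings for three data objects
cat_1 = np.array([0.9, 0.1, 0.8, 0.2])
cat_2 = np.array([0.8, 0.2, 0.9, 0.1])
car_1 = np.array([0.1, 0.9, 0.2, 0.8])

# Euclidean (L2) distance - the smaller the distance, the more similar the objects
print(np.linalg.norm(cat_1 - cat_2))  # small distance => similar objects
print(np.linalg.norm(cat_1 - car_1))  # larger distance => dissimilar objects

A Vector Database performs this kind of distance computation at scale, over millions of stored embedding vectors, using specialized indexes.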

Milvus is an open source, highly performant and scalable Vector Database that is specifically designed to handle queries on vector embeddings.


Milvus Architecture

For our hands-on demonstration, we will set up Milvus in the standalone mode, which is ideal for development and testing purposes.

Milvus leverages etcd as the metadata storage for storing the service registrations, heartbeat checks, information about the various components, and the checkpoints of the core components that would be useful during recovery.

In addition, Milvus uses minio as the storage engine for persisting and retrieving the data log snapshot files, index files, and intermediate query results.

The following illustration depicts the high level technical architecture of Milvus platform:


Milvus Architecture

Now for some basic terminology definitions in Milvus.

A Collection is similar to a table in the traditional Relational Database. A Database is a set of one or more Collections. An Index on a Collection enables faster queries/searches of the data in that Collection.

The requests to create and manipulate these database objects (Database, Collection, Index, etc.) are called Data Definition Language (or DDL for short). On the other hand, the requests to create and/or mutate data in a Collection are referred to as Data Manipulation Language (or DML for short).
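
As a rough mapping, the following sketch (using the pymilvus SDK that is introduced later in this article; collection_schema, index_params, and item are placeholders defined in the later code listings) shows which operations fall under DDL versus DML:

from pymilvus import db, Collection

# DDL - define and manipulate the database objects
db.create_database('mytest')                                              # create a Database
collection = Collection(name='news_snippets', schema=collection_schema)   # create a Collection
collection.create_index('news_snippet_vector', index_params)              # create an Index

# DML - create and/or mutate the data within a Collection
collection.insert(item)                 # insert an entity
collection.upsert(item)                 # insert or update an entity
collection.delete('news_id in [124]')   # delete entities matching an expression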

Now for some basic high-level explanation of the core layers in Milvus.

The Worker Node layer is the workhorse of the system and is designed to be stateless for scalability reasons. It is responsible for handling all the DML operations. It consists of the following three types of Nodes: the Query Node, which handles the search and query requests; the Data Node, which handles the data insert, delete, and update requests and writes the data log snapshots to the storage; and the Index Node, which builds the indexes on the vector data.

The Coordinator Service layer is responsible for routing all the DDL and DML operations. It consists of the following four types of Coordinators: the Root Coordinator, which handles the DDL requests and issues the global timestamps; the Query Coordinator, which manages the Query Nodes; the Data Coordinator, which manages the Data Nodes; and the Index Coordinator, which manages the Index Nodes and the index metadata.

The Log Broker layer acts as the pub-sub system for all the DML requests. The Worker Node(s) act as the subscribers to handle the DML requests. For the standalone Milvus system, RocksDB acts as the Log Broker; Milvus also supports Kafka or Pulsar as the pub-sub system.

The Access Layer contains the Proxy, which acts as the user endpoint for performing all the operations in the platform. The Proxy performs some pre-checks, such as making sure the requested objects exist and conform to the schema definition. Once the checks pass, it routes all the DDL requests to the Coordinator and all the DML requests to the Log Broker.


Installation and Setup

The installation is on an Ubuntu 22.04 LTS based Linux desktop.

First, ensure that the Python 3.x programming language is installed on the Linux desktop.

Next, ensure that Docker is installed and set up. Else, refer to the article Introduction to Docker for help.

Finally, ensure that Docker Compose is installed and set up. Else, refer to the article Introduction to Docker Compose for help.

Assuming that we are logged in as alice and the current working directory is the home directory /home/alice, we will set up a directory structure by executing the following commands in a terminal window:

$ mkdir -p $HOME/milvus/etcd

$ mkdir -p $HOME/milvus/minio

$ mkdir -p $HOME/milvus/milvus

$ mkdir -p $HOME/milvus/glove

Now, change the current working directory to the directory /home/alice/milvus. In the following paragraphs we will refer to this location as $MILVUS_HOME.
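
Since both the Docker Compose file and the Python code shown later in this article reference the MILVUS_HOME environment variable, export it by executing the following command (and ensure it is set in any new terminal window used later):

$ export MILVUS_HOME=$HOME/milvus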

For our exploration, we will be downloading and using the following three docker images: Etcd, MinIO, and Milvus.

To pull and download the docker image for Etcd, execute the following command:

$ docker pull quay.io/coreos/etcd:v3.5.9

The following should be the typical output:

Output.1

v3.5.9: Pulling from coreos/etcd
dd5ad9c9c29f: Pull complete 
960043b8858c: Pull complete 
b4ca4c215f48: Pull complete 
eebb06941f3e: Pull complete 
02cd68c0cbf6: Pull complete 
d3c894b5b2b0: Pull complete 
b40161cd83fc: Pull complete 
46ba3f23f1d3: Pull complete 
4fa131a1b726: Pull complete 
654ed51d2180: Pull complete 
5f673cfffa4e: Pull complete 
daea03978d14: Pull complete 
1191a2487b77: Pull complete 
eec65f887b31: Pull complete 
Digest: sha256:18ca110b5ce9a177bb80d6b4a08d73bda54b549d7a0eb6f66e6da69bf919c63f
Status: Downloaded newer image for quay.io/coreos/etcd:v3.5.9

Next, to pull and download the docker image for MinIO, execute the following command:

$ docker pull minio/minio:RELEASE.2023-09-30T07-02-29Z

The following should be the typical output:

Output.2

RELEASE.2023-09-30T07-02-29Z: Pulling from minio/minio
0cbafc6a7793: Pull complete 
0c06e955dc3b: Pull complete 
5bf3b024e1b0: Pull complete 
c3041f06b66f: Pull complete 
1ef03837ebc0: Pull complete 
9d74ffa4e082: Pull complete 
Digest: sha256:6262bc9a2730eeaf16be1bf436a3c2bca2ab76639f113778601a9f89c1485b56
Status: Downloaded newer image for minio/minio:RELEASE.2023-09-30T07-02-29Z
docker.io/minio/minio:RELEASE.2023-09-30T07-02-29Z

Finally, to pull and download the docker image for Milvus, execute the following command:

$ docker pull milvusdb/milvus:v2.3.1

The following should be the typical output:

Output.3

v2.3.1: Pulling from milvusdb/milvus
d5fd17ec1767: Pull complete 
165ae08a30c6: Pull complete 
cad7a9c60b89: Pull complete 
d4570c56711f: Pull complete 
a87594eaaa1d: Pull complete 
1f27396f6efc: Pull complete 
fe556ec02776: Pull complete 
Digest: sha256:9f37c7100c44be8e7419c55f285c2963ee4d25c71951a2325614a4565a03d7e6
Status: Downloaded newer image for milvusdb/milvus:v2.3.1
docker.io/milvusdb/milvus:v2.3.1

We are going to leverage the pre-trained GloVe word embeddings provided by Stanford in our demonstration.

To download the pre-trained GloVe word embeddings to the /tmp directory, execute the following command:

$ wget -P /tmp https://nlp.stanford.edu/data/glove.6B.zip

The following should be the typical output:

Output.4

--2023-10-06 20:35:29--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-10-06 19:35:29--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: '/tmp/glove.6B.zip'

glove.6B.zip                     100%[==========================================================>] 822.24M  5.02MB/s    in 2m 39s  

2023-10-06 20:38:09 (5.17 MB/s) - '/tmp/glove.6B.zip' saved [862182613/862182613]

We want to use the pre-trained GloVe word embeddings with 200 dimensions. To extract the desired word embeddings, execute the following command:

$ cd /tmp && unzip glove.6B.zip && mv ./glove.6B.200d.txt $MILVUS_HOME/glove && rm -rf glove.6B* && cd $MILVUS_HOME

The following should be the typical output:

Output.5

Archive:  glove.6B.zip
inflating: glove.6B.50d.txt        
inflating: glove.6B.100d.txt       
inflating: glove.6B.200d.txt       
inflating: glove.6B.300d.txt

Finally, we need to install the Python SDK pymilvus for interacting with Milvus. Note that the demonstration code later in this article also uses the gensim and nltk Python packages, so ensure they are installed as well. To install the SDK package, execute the following command:

$ sudo pip3 install pymilvus

This completes all the system installation and setup for the Milvus hands-on demonstration.

The following is the Docker Compose file located in the directory $MILVUS_HOME, which will be used to either start or stop the core Milvus services:


milvus-docker-compose.yml
version: '3.5'

services:
  etcd:
    container_name: milvus-etcd
    image: quay.io/coreos/etcd:v3.5.9
    environment:
      - ETCD_AUTO_COMPACTION_MODE=revision
      - ETCD_AUTO_COMPACTION_RETENTION=1000
      - ETCD_QUOTA_BACKEND_BYTES=4294967296
      - ETCD_SNAPSHOT_COUNT=50000
    user: "1000:1000"
    volumes:
      - ${MILVUS_HOME}/etcd:/etcd
    command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
    deploy:
      resources:
        limits:
          memory: 4G
        reservations:
          memory: 2G
  minio:
    container_name: milvus-minio
    image: minio/minio:RELEASE.2023-09-30T07-02-29Z
    environment:
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    user: "1000:1000"
    volumes:
      - ${MILVUS_HOME}/minio:/minio_data
    command: minio server /minio_data
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3
    deploy:
      resources:
        limits:
          memory: 4G
        reservations:
          memory: 2G
  standalone:
    container_name: milvus-standalone
    image: milvusdb/milvus:v2.3.1
    command: ["milvus", "run", "standalone"]
    environment:
      ETCD_ENDPOINTS: etcd:2379
      MINIO_ADDRESS: minio:9000
    user: "1000:1000"
    volumes:
      - ${MILVUS_HOME}/milvus:/var/lib/milvus
    ports:
      - "19530:19530"
      - "9091:9091"
    depends_on:
      - "etcd"
      - "minio"
    deploy:
      resources:
        limits:
          memory: 8G
        reservations:
          memory: 4G

networks:
  default:
    name: milvus

Hands-on with Milvus

For the demonstration, we will use Milvus in a Standalone Node mode (versus deploying in a multi-node cluster mode).

To start the Milvus server, execute the following command in the terminal window:

$ docker compose -f ./milvus-docker-compose.yml up

The following would be the typical output:

Output.6

[+] Running 4/4
 - Network milvus               Created                                                                                        0.1s 
 - Container milvus-etcd        Created                                                                                        0.1s 
 - Container milvus-minio       Created                                                                                        0.1s 
 - Container milvus-standalone  Created                                                                                        0.0s 
Attaching to milvus-etcd, milvus-minio, milvus-standalone
milvus-etcd        | {"level":"info","ts":"2023-10-07T16:45:32.668024Z","caller":"flags/flag.go:113","msg":"recognized and used environment variable","variable-name":"ETCD_AUTO_COMPACTION_MODE","variable-value":"revision"}
milvus-etcd        | {"level":"info","ts":"2023-10-07T16:45:32.668353Z","caller":"flags/flag.go:113","msg":"recognized and used environment variable","variable-name":"ETCD_AUTO_COMPACTION_RETENTION","variable-value":"1000"}
milvus-etcd        | {"level":"info","ts":"2023-10-07T16:45:32.668402Z","caller":"flags/flag.go:113","msg":"recognized and used environment variable","variable-name":"ETCD_QUOTA_BACKEND_BYTES","variable-value":"4294967296"}
milvus-etcd        | {"level":"info","ts":"2023-10-07T16:45:32.668412Z","caller":"flags/flag.go:113","msg":"recognized and used environment variable","variable-name":"ETCD_SNAPSHOT_COUNT","variable-value":"50000"}
milvus-etcd        | {"level":"warn","ts":"2023-10-07T16:45:32.669197Z","caller":"embed/config.go:673","msg":"Running http and grpc server on single port. This is not recommended for production."}
milvus-etcd        | {"level":"info","ts":"2023-10-07T16:45:32.669472Z","caller":"etcdmain/etcd.go:73","msg":"Running: ","args":["etcd","-advertise-client-urls=http://127.0.0.1:2379","-listen-client-urls","http://0.0.0.0:2379","--data-dir","/etcd"]}
milvus-etcd        | {"level":"warn","ts":"2023-10-07T16:45:32.669524Z","caller":"embed/config.go:673","msg":"Running http and grpc server on single port. This is not recommended for production."}
milvus-etcd        | {"level":"info","ts":"2023-10-07T16:45:32.669531Z","caller":"embed/etcd.go:127","msg":"configuring peer listeners","listen-peer-urls":["http://localhost:2380"]}
milvus-etcd        | {"level":"info","ts":"2023-10-07T16:45:32.670701Z","caller":"embed/etcd.go:135","msg":"configuring client listeners","listen-client-urls":["http://0.0.0.0:2379"]}
milvus-etcd        | {"level":"info","ts":"2023-10-07T16:45:32.670783Z","caller":"embed/etcd.go:309","msg":"starting an etcd server","etcd-version":"3.5.9","git-sha":"bdbbde998","go-version":"go1.19.9","go-os":"linux","go-arch":"amd64","max-cpu-set":16,"max-cpu-available":16,"member-initialized":false,"name":"default","data-dir":"/etcd","wal-dir":"","wal-dir-dedicated":"","member-dir":"/etcd/member","force-new-cluster":false,"heartbeat-interval":"100ms","election-timeout":"1s","initial-election-tick-advance":true,"snapshot-count":50000,"max-wals":5,"max-snapshots":5,"snapshot-catchup-entries":5000,"initial-advertise-peer-urls":["http://localhost:2380"],"listen-peer-urls":["http://localhost:2380"],"advertise-client-urls":["http://127.0.0.1:2379"],"listen-client-urls":["http://0.0.0.0:2379"],"listen-metrics-urls":[],"cors":["*"],"host-whitelist":["*"],"initial-cluster":"default=http://localhost:2380","initial-cluster-state":"new","initial-cluster-token":"etcd-cluster","quota-backend-bytes":4294967296,"max-request-bytes":1572864,"max-concurrent-streams":4294967295,"pre-vote":true,"initial-corrupt-check":false,"corrupt-check-time-interval":"0s","compact-check-time-enabled":false,"compact-check-time-interval":"1m0s","auto-compaction-mode":"revision","auto-compaction-retention":"1µs","auto-compaction-interval":"1µs","discovery-url":"","discovery-proxy":"","downgrade-check-interval":"5s"}
--- [ SNIP ] ---
milvus-minio       | MinIO Object Storage Server
milvus-minio       | Copyright: 2015-2023 MinIO, Inc.
milvus-minio       | License: GNU AGPLv3 <https://www.gnu.org/licenses/agpl-3.0.html>
milvus-minio       | Version: RELEASE.2023-09-30T07-02-29Z (go1.21.1 linux/amd64)
milvus-minio       | 
milvus-minio       | Status:         1 Online, 0 Offline. 
milvus-minio       | S3-API: http://172.18.0.3:9000  http://127.0.0.1:9000 
milvus-minio       | Console: http://172.18.0.3:34531 http://127.0.0.1:34531 
milvus-minio       | 
milvus-minio       | Documentation: https://min.io/docs/minio/linux/index.html
--- [ SNIP ] ---
milvus-standalone  |     __  _________ _   ____  ______    
milvus-standalone  |    /  |/  /  _/ /| | / / / / / __/    
milvus-standalone  |   / /|_/ // // /_| |/ / /_/ /\ \    
milvus-standalone  |  /_/  /_/___/____/___/\____/___/     
milvus-standalone  | 
milvus-standalone  | Welcome to use Milvus!
milvus-standalone  | Version:   v2.3.1
milvus-standalone  | Built:     Fri Sep 22 11:37:42 UTC 2023
milvus-standalone  | GitCommit: 70cf65b05
milvus-standalone  | GoVersion: go version go1.20.7 linux/amd64
milvus-standalone  | 
milvus-standalone  | open pid file: /tmp/milvus/standalone.pid
milvus-standalone  | lock pid file: /tmp/milvus/standalone.pid
milvus-standalone  | [2023/10/07 16:45:33.038 +00:00] [INFO] [roles/roles.go:294] ["starting running Milvus components"]
milvus-standalone  | [2023/10/07 16:45:33.039 +00:00] [INFO] [roles/roles.go:164] ["Enable Jemalloc"] ["Jemalloc Path"=/milvus/lib/libjemalloc.so]
milvus-standalone  | [2023/10/07 16:45:33.043 +00:00] [INFO] [config/refresher.go:66] ["start refreshing configurations"] [source=FileSource]
milvus-standalone  | [2023/10/07 16:45:33.043 +00:00] [DEBUG] [config/etcd_source.go:49] ["init etcd source"] [etcdInfo="{\"UseEmbed\":false,\"UseSSL\":false,\"Endpoints\":[\"etcd:2379\"],\"KeyPrefix\":\"by-dev\",\"CertFile\":\"/path/to/etcd-client.pem\",\"KeyFile\":\"/path/to/etcd-client-key.pem\",\"CaCertFile\":\"/path/to/ca.pem\",\"MinVersion\":\"1.3\",\"RefreshInterval\":5000000000}"]
--- [ SNIP ] ---
milvus-standalone  | [2023/10/07 16:45:42.228 +00:00] [INFO] [gc/gc_tuner.go:90] ["GC Tune done"] ["previous GOGC"=100] ["heapuse "=30] ["total memory"=4528] ["next GC"=42] ["new GOGC"=200] [gc-pause=133.429µs] [gc-pause-end=1696697142228228341]

To check if all the necessary docker containers are up and running, execute the following command in another terminal window:

$ docker ps

The following would be the typical output:

Output.7

CONTAINER ID   IMAGE                                      COMMAND                  CREATED         STATUS                     PORTS                                                                                      NAMES
f9613aae4eb0   milvusdb/milvus:v2.3.1                     "/tini -- milvus run..."   2 minutes ago   Up 2 minutes             0.0.0.0:9091->9091/tcp, :::9091->9091/tcp, 0.0.0.0:19530->19530/tcp, :::19530->19530/tcp   milvus-standalone
68516a50772f   minio/minio:RELEASE.2023-09-30T07-02-29Z   "/usr/bin/docker-ent..."   2 minutes ago   Up 2 minutes (healthy)   9000/tcp                                                                                   milvus-minio
627a9207610e   quay.io/coreos/etcd:v3.5.9                 "etcd -advertise-cli..."   2 minutes ago   Up 2 minutes             2379-2380/tcp                                                                              milvus-etcd

For the hands-on demonstration, we will create a simple, pipe-delimited news snippets file that contains an id, the category, and a summary of each news item. Once the news items are inserted into Milvus, we will search for similar news items given a particular news item.

The following is the simple news snippets file located in the directory $MILVUS_HOME:


news-snippets.txt
101|Business|Government shutdown, labor strikes will likely weigh on US consumers attitudes
102|Business|Stocks Suffer Worst Quarter Of 2023 As Good Vibes Dry Out
103|Business|China's economy stabilises, factory activity returns to expansion
104|Food|Frozen fruit recalled due to possible listeria outbreak
105|Food|Trader Joe's has had multiple food recalls recently
106|Health|COVID-19 linked to more sepsis cases than previously thought
107|Health|Parasitic worm that can enter brain was found in Atlanta
108|Health|As Covid infections rise, nursing homes are still waiting for vaccines
109|Health|Drinking too much water, also known as water intoxication can be deadly
110|Health|Virginia is experiencing a statewide outbreak of meningococcal disease
111|Technology|Apple will issue a software update to address iPhone 15 overheating complaints
112|Technology|Blend uses generative AI to give you a personalized clothing guide
113|Technology|Supermicro CEO predicts 20 percent of datacenters will adopt liquid cooling
114|Technology|Microsoft is planning to use nuclear energy to power its AI data centers
115|Technology|Amazon to Invest Up to $4 Billion in AI Startup Anthropic
116|Technology|Raspberry Pi 5 unveiled with a faster CPU and GPU along with other major upgrades
117|Travel|Shrek's 'swamp' now available to rent on Airbnb
118|Travel|Plane takes off with passengers but not a single checked bag on board
119|Travel|Luxury cruise ship freed after running aground in a Greenland fjord
120|Weather|New York declared a state of emergency due to extreme rainfall and flooding
121|Weather|Moroccans rally after devastating earthquake caused widespread destruction of lives, villages, and cultural sites
122|Weather|More than 5,000 presumed dead in Libya after catastrophic flooding breaks dams and sweeps away homes
123|Weather|Typhoon Haikui slams into Taiwan, disrupting work, classes and travel

To perform a similarity search, we need to store the embedding vector of each news item in a Milvus database Collection. This is where the pre-trained GloVe word embeddings come in handy.

To initialize and load the pre-trained GloVe word embeddings, execute the following code snippet:


import os
from gensim.models import KeyedVectors

milvus_home = os.getenv('MILVUS_HOME')

glove_6B_200d = os.path.join(milvus_home, 'glove/glove.6B.200d.txt')

w2v_model = KeyedVectors.load_word2vec_format(glove_6B_200d, binary=False, no_header=True)
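
As a quick sanity check of the loaded model, one can look up the vector for an individual word (the word 'tiger' below is just an illustrative example); each vector should have 200 dimensions:

vector = w2v_model['tiger']
print(vector.shape)                             # expected: (200,)
print(w2v_model.most_similar('tiger', topn=3))  # a few nearest words by cosine similarity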

To connect to the standalone Milvus instance, execute the following code snippet:


from pymilvus import connections

db_alias = 'default'

connections.connect(alias=db_alias, host='localhost', port='19530')
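
To optionally verify the connection, the pymilvus utility module can be used to query the server version (a simple sanity check, shown here as an assumption on the utility API):

from pymilvus import utility

print(utility.get_server_version(using=db_alias))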

To create a new database in Milvus, execute the following code snippet:


from pymilvus import db

db_name = 'mytest'

db.create_database(db_name)

To create a Collection in our Milvus database, execute the following code snippet:


from pymilvus import FieldSchema, CollectionSchema, DataType, Collection

def create_collection(alias, collection_name):
  def_news_category = 'General'
  # Define the fields of the schema
  news_id = FieldSchema(
      name='news_id',
      dtype=DataType.INT64,
      is_primary=True
  )
  news_category = FieldSchema(
      name='news_category',
      dtype=DataType.VARCHAR,
      max_length=15,
      default_value=def_news_category
  )
  news_snippet_text = FieldSchema(
      name='news_snippet_text',
      dtype=DataType.VARCHAR,
      max_length=256
  )
  news_snippet_vector = FieldSchema(
      name='news_snippet_vector',
      dtype=DataType.FLOAT_VECTOR,
      dim=200
  )
  # Define the collection
  collection_schema = CollectionSchema(
      fields=[news_id, news_category, news_snippet_text, news_snippet_vector],
      description="News Snippet Search",
      enable_dynamic_field=True
  )
  # Create the collection
  collection = Collection(
      name=collection_name,
      schema=collection_schema,
      using=alias
  )
  # Return the collection
  return collection
  
db.using_database(db_name)

db_collection = 'news_snippets'

collection = create_collection(db_alias, db_collection)

Notice in the code snippet above that, in order to create a Collection in our Milvus database, one needs to define the schema for the Collection.

Also, notice that we have a field for the vector embedding called news_snippet_vector with a data type of DataType.FLOAT_VECTOR.

To create an index on the field news_snippet_vector, execute the following code snippet:


def create_index(field, collection):
  index_params = {
      'metric_type': 'L2',
      'index_type': 'IVF_FLAT',
      'params': {'nlist': 128}
  }
  collection.create_index(field, index_params)

field_name = 'news_snippet_vector'

create_index(field_name, collection)

Notice in the code snippet above that we are using the Euclidean Distance metric (option L2) and the IVF_FLAT index type, which partitions the vectors into nlist clusters, when creating the index.
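
The distances reported in the search results later in this article are L2 distances between the query vector and the stored vectors (Milvus may internally skip the square root, which does not change the ranking). The following is a minimal numpy sketch, outside of Milvus, of the metric being applied:

import numpy as np

def l2_distance(a, b):
  # Euclidean (L2) distance between two vectors; smaller means more similar
  return np.linalg.norm(np.asarray(a) - np.asarray(b))

# At search time, the number of IVF_FLAT clusters probed can be tuned via 'nprobe', e.g.:
#   'param': {'metric_type': 'L2', 'params': {'nprobe': 10}}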

To insert each of the news snippet items into our Collection in the Milvus database, execute the following code snippet:


from nltk.tokenize import WordPunctTokenizer

def get_new_snippets(filename):
  with open(filename) as file:
    lines = [line.split('|') for line in file]
  return lines

new_snippets_file = os.path.join(milvus_home, 'news-snippets.txt')

ns_lines = get_new_snippets(new_snippets_file)

def get_sentence_embedding(sentence, tokenizer, model):
  tokens = tokenizer.tokenize(sentence)
  words = [word.lower() for word in tokens if word.isalpha()]
  embedding = model.get_mean_vector(words)
  return embedding

tokenizer = WordPunctTokenizer()

id_106 = '106'
vec_106 = None
id_112 = '112'
vec_112 = None

for line in ns_lines:
  embedding = get_sentence_embedding(line[2].strip(), tokenizer, w2v_model)
  item = [[int(line[0])], [line[1]], [line[2]], [embedding]]
  if line[0] == id_106:
      vec_106 = embedding
  if line[0] == id_112:
      vec_112 = embedding
  collection.insert(item)
collection.flush()

Notice in the above code snippet that we compute the mean vector embedding of the words in each news item sentence. This is a simple approximation for the embedding of a document (the sentence in this case).
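
The following is a rough numpy equivalent of what gensim's get_mean_vector() computes for a sentence, assuming gensim's default behavior of unit-normalizing each word vector before averaging and ignoring words that are not in the model's vocabulary:

import numpy as np

def mean_sentence_embedding(words, model):
  # Average the unit-normalized word vectors of the in-vocabulary words
  vectors = [model[w] / np.linalg.norm(model[w]) for w in words if w in model]
  return np.mean(vectors, axis=0)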

To search for news snippets similar to the news snippet with id 106, execute the following code snippet (the search_collection() helper function is defined in the complete listing further below):


collection.load()

results = search_collection(field_name, vec_106, collection)
for i, result in enumerate(results):
    for j, res in enumerate(result):
        print(f'---> {res}')

Note that Milvus performs similarity searches in memory. In order to bring the data of a given Collection into memory, one needs to invoke the collection.load() operation.

The following is the complete Python code which performs all the basic operations on our Milvus instance:


sample.py
#
# @Author: Bhaskar S
# @Blog:   https://www.polarsparc.com
# @Date:   07 Oct 2023
#

import os
from gensim.models import KeyedVectors
from nltk.tokenize import WordPunctTokenizer
from pymilvus import connections, db
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection


def get_new_snippets(filename):
  with open(filename) as file:
    lines = [line.split('|') for line in file]
  return lines


def init_embedding_model(filename):
  w2v_model = KeyedVectors.load_word2vec_format(filename, binary=False, no_header=True)
  return w2v_model


# Function to return the vector embedding for a given sentence
def get_sentence_embedding(sentence, tokenizer, model):
  tokens = tokenizer.tokenize(sentence)
  words = [word.lower() for word in tokens if word.isalpha()]
  embedding = model.get_mean_vector(words)
  return embedding


# Function to create a Milvus collection
def create_collection(alias, collection_name):
  def_news_category = 'General'
  # Define the fields of the schema
  news_id = FieldSchema(
    name='news_id',
    dtype=DataType.INT64,
    is_primary=True
  )
  news_category = FieldSchema(
    name='news_category',
    dtype=DataType.VARCHAR,
    max_length=15,
    default_value=def_news_category
  )
  news_snippet_text = FieldSchema(
    name='news_snippet_text',
    dtype=DataType.VARCHAR,
    max_length=256
  )
  news_snippet_vector = FieldSchema(
    name='news_snippet_vector',
    dtype=DataType.FLOAT_VECTOR,
    dim=200
  )
  # Define the collection
  collection_schema = CollectionSchema(
    fields=[news_id, news_category, news_snippet_text, news_snippet_vector],
    description="News Snippet Search",
    enable_dynamic_field=True
  )
  # Create the collection
  collection = Collection(
    name=collection_name,
    schema=collection_schema,
    using=alias
  )
  # Return the collection
  return collection


# Function to create an index on the specified field of the given collection
def create_index(field, collection):
  index_params = {
    'metric_type': 'L2',
    'index_type': 'IVF_FLAT',
    'params': {'nlist': 128}
  }
  collection.create_index(field, index_params)


# Function that performs a Similarity search on the given collection
def search_collection(field, vector, collection, limit=3):
  search_params = {
    'data': [vector],
    'anns_field': field,
    'limit': limit,
    'param': {'metric_type': 'L2'}
  }
  results = collection.search(**search_params)
  return results


# Main function
def main():
  milvus_home = os.getenv('MILVUS_HOME')
  print(f'milvus_home: {milvus_home}')

  glove_6B_200d = os.path.join(milvus_home, 'glove/glove.6B.200d.txt')
  print(f'glove_6B_200d: {glove_6B_200d}')

  print(f'+++ Initializing {glove_6B_200d} embedding model')
  model = init_embedding_model(glove_6B_200d)

  tokenizer = WordPunctTokenizer()

  new_snippets_file = os.path.join(milvus_home, 'news-snippets.txt')
  print(f'+++ Reading news snippets from {new_snippets_file}')
  ns_lines = get_new_snippets(new_snippets_file)

  db_alias = 'default'
  db_name = 'mytest'
  db_collection = 'news_snippets'

  print(f'+++ Connecting to the standalone Milvus Vector DB')
  connections.connect(alias=db_alias, host='localhost', port='19530')

  db_list = db.list_database()
  print(f'+++ List of databases (before) - {db_list}')

  print(f'+++ Creating the DB - {db_name}')
  db.create_database(db_name)

  db_list = db.list_database()
  print(f'+++ List of databases (after) - {db_list}')

  print(f'+++ Using the DB - {db_name}')
  db.using_database(db_name)

  print(f'+++ Creating collection - {db_collection}')
  collection = create_collection(db_alias, db_collection)

  print(f'+++ Number of entries in collection {db_collection}: {collection.num_entities}')

  field_name = 'news_snippet_vector'
  print(f'+++ Creating index on collection {db_collection} for field: {field_name}')
  create_index(field_name, collection)

  id_106 = '106'
  vec_106 = None
  id_112 = '112'
  vec_112 = None

  print(f'+++ Inserting news snippets into collection {db_collection}')
  for line in ns_lines:
    embedding = get_sentence_embedding(line[2].strip(), tokenizer, model)
    item = [[int(line[0])], [line[1]], [line[2]], [embedding]]
    if line[0] == id_106:
      vec_106 = embedding
    if line[0] == id_112:
      vec_112 = embedding
    collection.insert(item)
  collection.flush()

  print(f'+++ Number of entries in collection {db_collection}: {collection.num_entities}')

  print(f'+++ [1] Loading collection {db_collection} to memory')
  collection.load()

  print(f'+++ Searching Health news snippets similar to #106 from collection {db_collection}')
  results = search_collection(field_name, vec_106, collection)
  for i, result in enumerate(results):
    for j, res in enumerate(result):
      print(f'[1] ---> {res}')

  print(f'+++ [1] Searching Technology news snippets similar to #112 from collection {db_collection}')
  results = search_collection(field_name, vec_112, collection)
  for i, result in enumerate(results):
    for j, res in enumerate(result):
      print(f'---> {res}')

  print(f'+++ [1] Upserting #124 dummy news snippets into collection {db_collection}')
  line_1 = 'No interesting Tech news'
  embedding_1 = get_sentence_embedding(line_1, tokenizer, model)
  item = [[124], ['Technology'], [line_1], [embedding_1]]
  collection.upsert(item)
  collection.flush()

  print(f'+++ [2] Loading collection {db_collection} to memory')
  collection.load()

  print(f'+++ [2] Searching Technology news snippets similar to #112 from collection {db_collection}')
  results = search_collection(field_name, vec_112, collection, limit=5)
  for i, result in enumerate(results):
    for j, res in enumerate(result):
      print(f'[2] ---> {res}')

  print(f'+++ [2] Upserting #124 real news snippets in collection {db_collection}')
  line_2 = 'Amazon to Invest resources in its AI Platform'
  embedding_2 = get_sentence_embedding(line_2, tokenizer, model)
  item = [[124], ['Technology'], [line_2], [embedding_2]]
  collection.upsert(item)
  collection.flush()

  print(f'+++ [3] Loading collection {db_collection} to memory')
  collection.load()

  print(f'+++ [3] Searching Technology news snippets similar to #112 from collection {db_collection}')
  results = search_collection(field_name, vec_112, collection, limit=5)
  for i, result in enumerate(results):
    for j, res in enumerate(result):
      print(f'[3] ---> {res}')

  print(f'+++ Deleting #124 news snippets from collection {db_collection}')
  expr = 'news_id in [124]'
  collection.delete(expr)
  collection.flush()

  print(f'+++ [4] Searching Technology news snippets similar to #112 from collection {db_collection}')
  results = search_collection(field_name, vec_112, collection)
  for i, result in enumerate(results):
    for j, res in enumerate(result):
      print(f'[4] ---> {res}')

  print(f'--- Releasing collection {db_collection} from memory')
  collection.release()

  print(f'--- Dropping index for  collection {db_collection}')
  collection.drop_index()

  print(f'--- Dropping collection {db_collection}')
  collection.drop()

  print(f'--- Dropping the DB - {db_name}')
  db.drop_database(db_name)

  print(f'--- Disconnecting from the standalone Milvus Vector DB')
  connections.disconnect(db_alias)


if __name__ == '__main__':
    main()

Executing the above Python code generates the following typical output:

Output.8

milvus_home: /home/alice/milvus
glove_6B_200d: /home/alice/milvus/glove/glove.6B.200d.txt
+++ Initializing /home/alice/milvus/glove/glove.6B.200d.txt embedding model
+++ Reading news snippets from /home/alice/milvus/news-snippets.txt
+++ Connecting to the standalone Milvus Vector DB
+++ List of databases (before) - ['default']
+++ Creating the DB - mytest
+++ List of databases (after) - ['default', 'mytest']
+++ Using the DB - mytest
+++ Creating collection - news_snippets
+++ Number of entries in collection news_snippets: 0
+++ Creating index on collection news_snippets for field: news_snippet_vector
+++ Inserting news snippets into collection news_snippets
+++ Number of entries in collection news_snippets: 23
+++ [1] Loading collection news_snippets to memory
+++ Searching Health news snippets similar to #106 from collection news_snippets
[1] ---> id: 106, distance: 0.0, entity: {}
[1] ---> id: 108, distance: 0.1292576789855957, entity: {}
[1] ---> id: 107, distance: 0.14604675769805908, entity: {}
+++ [1] Searching Technology news snippets similar to #112 from collection news_snippets
---> id: 112, distance: 0.0, entity: {}
---> id: 114, distance: 0.15123721957206726, entity: {}
---> id: 115, distance: 0.16058120131492615, entity: {}
+++ [1] Upserting #124 dummy news snippets into collection news_snippets
+++ [2] Loading collection news_snippets to memory
+++ [2] Searching Technology news snippets similar to #112 from collection news_snippets
[2] ---> id: 112, distance: 0.0, entity: {}
[2] ---> id: 114, distance: 0.15123721957206726, entity: {}
[2] ---> id: 115, distance: 0.16058120131492615, entity: {}
[2] ---> id: 117, distance: 0.164011150598526, entity: {}
[2] ---> id: 116, distance: 0.18734167516231537, entity: {}
+++ [2] Upserting #124 real news snippets in collection news_snippets
+++ [3] Loading collection news_snippets to memory
+++ [3] Searching Technology news snippets similar to #112 from collection news_snippets
[3] ---> id: 112, distance: 0.0, entity: {}
[3] ---> id: 114, distance: 0.15123721957206726, entity: {}
[3] ---> id: 115, distance: 0.16058120131492615, entity: {}
[3] ---> id: 117, distance: 0.164011150598526, entity: {}
[3] ---> id: 124, distance: 0.17247149348258972, entity: {}
+++ Deleting #124 news snippets from collection news_snippets
+++ [4] Searching Technology news snippets similar to #112 from collection news_snippets
[4] ---> id: 112, distance: 0.0, entity: {}
[4] ---> id: 114, distance: 0.15123721957206726, entity: {}
[4] ---> id: 115, distance: 0.16058120131492615, entity: {}
--- Releasing collection news_snippets from memory
--- Dropping index for  collection news_snippets
--- Dropping collection news_snippets
--- Dropping the DB - mytest
--- Disconnecting from the standalone Milvus Vector DB

Interestingly enough, the Milvus similarity searches seem to produce quite accurate results.
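
Note that the entity portion of each search result above is empty because the search did not request any output fields. To have Milvus return, say, the news category and snippet text along with each match, one could pass the output_fields parameter to the search, as in the following tweak to the search_collection() helper (shown here as an assumption, not part of the original listing):

search_params = {
  'data': [vector],
  'anns_field': field,
  'limit': limit,
  'param': {'metric_type': 'L2'},
  'output_fields': ['news_category', 'news_snippet_text']
}
results = collection.search(**search_params)

Finally, to stop and clean up the core Milvus services, execute the following command in the terminal window:

$ docker compose -f ./milvus-docker-compose.yml down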


References

GloVe: Global Vectors for Word Representation

Milvus Documentation



© PolarSPARC