go.od_planter
[ollama] embedding maximum batch size = 41666 본문
1160분, 거의 20시간 가까이 돌아간 Ollama embedding 작업 중 오류가 떴다...
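결론은 maximum batch size가 존재한다는 것. 아래 traceback을 보면 이 제한은 Ollama 쪽이 아니라 ChromaDB의 validate_batch에서 걸린 것으로, 한 번의 upsert에 넣을 수 있는 최대 개수가 41666개인데 61179개를 한꺼번에 넣으려다 실패했다. 따라서 문서를 최대 배치 크기 이하로 나누어 여러 번에 걸쳐 넣어야 한다.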
{
"name": "ValueError",
"message": "Batch size 61179 exceeds maximum batch size 41666",
"stack": "---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[28], line 5
2 from langchain.vectorstores import Chroma
4 oembed = OllamaEmbeddings(base_url=\"http://localhost:11434\", model = 'llama3.1:latest')
----> 5 vectorsotre = Chroma.from_documents(documents=texts, embedding=oembed)
File /opt/homebrew/anaconda3/envs/llm-dacon/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py:878, in Chroma.from_documents(cls, documents, embedding, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)
876 texts = [doc.page_content for doc in documents]
877 metadatas = [doc.metadata for doc in documents]
--> 878 return cls.from_texts(
879 texts=texts,
880 embedding=embedding,
881 metadatas=metadatas,
882 ids=ids,
883 collection_name=collection_name,
884 persist_directory=persist_directory,
885 client_settings=client_settings,
886 client=client,
887 collection_metadata=collection_metadata,
888 **kwargs,
889 )
File /opt/homebrew/anaconda3/envs/llm-dacon/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py:842, in Chroma.from_texts(cls, texts, embedding, metadatas, ids, collection_name, persist_directory, client_settings, client, collection_metadata, **kwargs)
836 chroma_collection.add_texts(
837 texts=batch[3] if batch[3] else [],
838 metadatas=batch[2] if batch[2] else None,
839 ids=batch[0],
840 )
841 else:
--> 842 chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)
843 return chroma_collection
File /opt/homebrew/anaconda3/envs/llm-dacon/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py:313, in Chroma.add_texts(self, texts, metadatas, ids, **kwargs)
311 raise ValueError(e.args[0] + \"\\n\\n\" + msg)
312 else:
--> 313 raise e
314 if empty_ids:
315 texts_without_metadatas = [texts[j] for j in empty_ids]
File /opt/homebrew/anaconda3/envs/llm-dacon/lib/python3.10/site-packages/langchain_community/vectorstores/chroma.py:299, in Chroma.add_texts(self, texts, metadatas, ids, **kwargs)
297 ids_with_metadata = [ids[idx] for idx in non_empty_ids]
298 try:
--> 299 self._collection.upsert(
300 metadatas=metadatas,
301 embeddings=embeddings_with_metadatas,
302 documents=texts_with_metadatas,
303 ids=ids_with_metadata,
304 )
305 except ValueError as e:
306 if \"Expected metadata value to be\" in str(e):
File /opt/homebrew/anaconda3/envs/llm-dacon/lib/python3.10/site-packages/chromadb/api/models/Collection.py:300, in Collection.upsert(self, ids, embeddings, metadatas, documents, images, uris)
279 \"\"\"Update the embeddings, metadatas or documents for provided ids, or create them if they don't exist.
280
281 Args:
(...)
288 None
289 \"\"\"
290 (
291 ids,
292 embeddings,
(...)
297 ids, embeddings, metadatas, documents, images, uris
298 )
--> 300 self._client._upsert(
301 collection_id=self.id,
302 ids=ids,
303 embeddings=embeddings,
304 metadatas=metadatas,
305 documents=documents,
306 uris=uris,
307 )
File /opt/homebrew/anaconda3/envs/llm-dacon/lib/python3.10/site-packages/chromadb/telemetry/opentelemetry/__init__.py:146, in trace_method.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
144 global tracer, granularity
145 if trace_granularity < granularity:
--> 146 return f(*args, **kwargs)
147 if not tracer:
148 return f(*args, **kwargs)
File /opt/homebrew/anaconda3/envs/llm-dacon/lib/python3.10/site-packages/chromadb/api/segment.py:429, in SegmentAPI._upsert(self, collection_id, ids, embeddings, metadatas, documents, uris)
427 coll = self._get_collection(collection_id)
428 self._manager.hint_use_collection(collection_id, t.Operation.UPSERT)
--> 429 validate_batch(
430 (ids, embeddings, metadatas, documents, uris),
431 {\"max_batch_size\": self.get_max_batch_size()},
432 )
433 records_to_submit = []
434 for r in _records(
435 t.Operation.UPSERT,
436 ids=ids,
(...)
440 uris=uris,
441 ):
File /opt/homebrew/anaconda3/envs/llm-dacon/lib/python3.10/site-packages/chromadb/api/types.py:541, in validate_batch(batch, limits)
530 def validate_batch(
531 batch: Tuple[
532 IDs,
(...)
538 limits: Dict[str, Any],
539 ) -> None:
540 if len(batch[0]) > limits[\"max_batch_size\"]:
--> 541 raise ValueError(
542 f\"Batch size {len(batch[0])} exceeds maximum batch size {limits['max_batch_size']}\"
543 )
ValueError: Batch size 61179 exceeds maximum batch size 41666"
}