Commit 8c2933b
1 parent: 25e5265
Showing 41 changed files with 9,887 additions and 24 deletions.
cache_util.d.ts
@@ -0,0 +1,7 @@
import { AppConfig } from "./config";
export declare function hasModelInCache(modelId: string, appConfig?: AppConfig): Promise<boolean>;
export declare function deleteModelAllInfoInCache(modelId: string, appConfig?: AppConfig): Promise<void>;
export declare function deleteModelInCache(modelId: string, appConfig?: AppConfig): Promise<void>;
export declare function deleteChatConfigInCache(modelId: string, appConfig?: AppConfig): Promise<void>;
export declare function deleteModelWasmInCache(modelId: string, appConfig?: AppConfig): Promise<void>;
//# sourceMappingURL=cache_util.d.ts.map
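The cache helpers above can be used to check for and evict a previously downloaded model. A minimal usage sketch; the model id is a placeholder, and the "@mlc-ai/web-llm" import path is an assumption about the published npm entry point:

// Illustrative only: modelId is a placeholder, "@mlc-ai/web-llm" is the assumed package entry point.
import { hasModelInCache, deleteModelAllInfoInCache } from "@mlc-ai/web-llm";

async function evictIfCached(modelId: string): Promise<void> {
  // Resolves to true if the model's artifacts are already in the browser cache.
  if (await hasModelInCache(modelId)) {
    // Removes the weights, chat config, and wasm associated with the model.
    await deleteModelAllInfoInCache(modelId);
  }
}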
config.d.ts
@@ -0,0 +1,159 @@
import { ResponseFormat } from "./openai_api_protocols";
import { LogitProcessor, InitProgressCallback } from "./types";
/**
 * Conversation template config
 */
export interface ConvTemplateConfig {
    system_template: string;
    system_message: string;
    roles: Record<Role, string>;
    role_templates?: Partial<Record<Role, string>>;
    seps: Array<string>;
    role_content_sep?: string;
    role_empty_sep?: string;
    offset: number;
    stop_str: Array<string>;
    system_prefix_token_ids?: Array<number>;
    stop_token_ids: Array<number>;
    add_role_after_system_message?: boolean;
}
export declare enum Role {
    user = "user",
    assistant = "assistant"
}
/**
 * Placeholders that can be used in role templates.
 * For example, a role template of
 * `<<question>> ${MessagePlaceholders.USER} <<function>> ${MessagePlaceholders.FUNCTION}`
 * will insert the user message at ${MessagePlaceholders.USER}
 * and the function message at ${MessagePlaceholders.FUNCTION}
 * at run time.
 */
export declare enum MessagePlaceholders {
    system = "{system_message}",
    user = "{user_message}",
    assistant = "{assistant_message}",
    tool = "{tool_message}",
    function = "{function_string}"
}
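For instance, a conversation-config override could use these placeholders inside `role_templates` (note the declared enum members are lowercase: `MessagePlaceholders.user`, `MessagePlaceholders.function`). The sketch below is illustrative only; the delimiter strings are made up:

// Hypothetical conv_config override; only meant to show where the placeholders go.
import { ConvTemplateConfig, Role, MessagePlaceholders } from "./config";

const convConfigOverride: Partial<ConvTemplateConfig> = {
  role_templates: {
    // At run time, the user's message replaces {user_message} and any
    // function/tool string replaces {function_string}.
    [Role.user]: `<<question>> ${MessagePlaceholders.user} <<function>> ${MessagePlaceholders.function}`,
  },
};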
/**
 * Config of one chat model, a data structure representing `mlc-chat-config.json`.
 * This only corresponds to the chat-related fields and `tokenizer_files` of `mlc-chat-config.json`.
 * Only these fields affect the conversation at runtime,
 * i.e. the third part in https://llm.mlc.ai/docs/get_started/mlc_chat_config.html.
 *
 * This is initialized in `ChatModule.reload()` with the model's `mlc-chat-config.json`.
 */
export interface ChatConfig {
    tokenizer_files: Array<string>;
    conv_config?: Partial<ConvTemplateConfig>;
    conv_template: string | ConvTemplateConfig;
    mean_gen_len: number;
    max_gen_len: number;
    shift_fill_factor: number;
    repetition_penalty: number;
    frequency_penalty: number;
    presence_penalty: number;
    top_p: number;
    temperature: number;
    bos_token_id?: number;
}
/**
 * Custom options that can be used to override known config values.
 */
export interface ChatOptions extends Partial<ChatConfig> {
}
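Since `ChatOptions` is just a `Partial<ChatConfig>`, any subset of the fields above can be supplied to override the model's `mlc-chat-config.json`. A short sketch; the values are illustrative, and the relative import mirrors the paths used inside the package:

import { ChatOptions } from "./config";

// Illustrative override: only the listed fields change; everything else
// falls back to the model's own mlc-chat-config.json.
const chatOpts: ChatOptions = {
  temperature: 0.7,
  top_p: 0.95,
  repetition_penalty: 1.05,
};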
/**
 * Optional configurations for `CreateEngine()` and `CreateWebWorkerEngine()`.
 *
 * chatOpts: To optionally override the `mlc-chat-config.json` of `modelId`.
 * appConfig: Configure the app, including the list of models and whether to use IndexedDB cache.
 * initProgressCallback: A callback for showing the progress of loading the model.
 * logitProcessorRegistry: A registry for stateful logit processors, see `webllm.LogitProcessor`.
 *
 * @note All fields are optional, and `logitProcessorRegistry` is only used for `CreateEngine()`,
 * not `CreateWebWorkerEngine()`.
 */
export interface EngineConfig {
    chatOpts?: ChatOptions;
    appConfig?: AppConfig;
    initProgressCallback?: InitProgressCallback;
    logitProcessorRegistry?: Map<string, LogitProcessor>;
}
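A sketch of an `EngineConfig` that could be passed to `CreateEngine()`. The progress-report field accessed in the callback (`report.text`) is an assumption about `InitProgressCallback`, whose shape is not spelled out in this file:

import { EngineConfig } from "./config";

// Illustrative EngineConfig; report.text is an assumed field of the progress report.
const engineConfig: EngineConfig = {
  chatOpts: { temperature: 0.7 },
  initProgressCallback: (report) => console.log(report.text),
};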
/**
 * Config for a single generation.
 * Essentially `ChatConfig` without `tokenizer_files`, `conv_config`, or `conv_template`.
 * We also support additional fields not present in `mlc-chat-config.json` to match OpenAI-like APIs.
 *
 * Note that all values are optional. If unspecified, we use whatever values were in the `ChatConfig`
 * initialized during `ChatModule.reload()`.
 */
export interface GenerationConfig {
    mean_gen_len?: number;
    shift_fill_factor?: number;
    repetition_penalty?: number;
    top_p?: number | null;
    temperature?: number | null;
    max_gen_len?: number | null;
    frequency_penalty?: number | null;
    presence_penalty?: number | null;
    stop?: string | null | Array<string>;
    n?: number | null;
    logit_bias?: Record<string, number> | null;
    logprobs?: boolean | null;
    top_logprobs?: number | null;
    response_format?: ResponseFormat | null;
}
export declare function postInitAndCheckGenerationConfigValues(config: GenerationConfig): void;
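A per-request `GenerationConfig` might look like the sketch below (values are illustrative); `postInitAndCheckGenerationConfigValues()` is the declared hook for validating such a config, though its exact behavior is not spelled out in this file:

import { GenerationConfig, postInitAndCheckGenerationConfigValues } from "./config";

// Illustrative per-request overrides; unspecified fields fall back to the
// ChatConfig loaded during reload().
const genConfig: GenerationConfig = {
  temperature: 0.8,
  top_p: 0.9,
  max_gen_len: 256,
  stop: ["\n\n"],
};
// Presumably raises on invalid values, per its name.
postInitAndCheckGenerationConfigValues(genConfig);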
/**
 * Information for a model.
 * @param model_url: the Hugging Face link to download the model weights from.
 * @param model_id: what we call the model.
 * @param model_lib_url: link to the model library (wasm file) the model uses.
 * @param vram_required_MB: amount of VRAM in MB required to run the model (can use
 * `utils/vram_requirements` to calculate).
 * @param low_resource_required: whether the model can run on limited devices (e.g. an Android phone).
 * @param buffer_size_required_bytes: required `maxStorageBufferBindingSize`, different for each device.
 * @param required_features: features needed to run this model (e.g. shader-f16).
 */
export interface ModelRecord {
    model_url: string;
    model_id: string;
    model_lib_url: string;
    vram_required_MB?: number;
    low_resource_required?: boolean;
    buffer_size_required_bytes?: number;
    required_features?: Array<string>;
}
/**
 * Extra configuration that can be passed when loading a model.
 *
 * @param model_list: models to be used.
 * @param useIndexedDBCache: if true, will use IndexedDBCache to cache models and other artifacts.
 * If false or unspecified, will use the Cache API. For more information on the two, see:
 * https://developer.mozilla.org/en-US/docs/Web/API/Storage_API/Storage_quotas_and_eviction_criteria#what_technologies_store_data_in_the_browser
 *
 * @note The Cache API is the better tested of the two in WebLLM as of now.
 */
export interface AppConfig {
    model_list: Array<ModelRecord>;
    useIndexedDBCache?: boolean;
}
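Putting `ModelRecord` and `AppConfig` together, a custom model list could be declared as below. The URLs and model id are placeholders, not real artifacts:

import { AppConfig } from "./config";

// Placeholder URLs and model id, purely for illustration.
const appConfig: AppConfig = {
  model_list: [
    {
      model_url: "https://huggingface.co/<org>/<model>/resolve/main/",
      model_id: "MyModel-q4f32_1",
      model_lib_url: "https://example.com/MyModel-q4f32_1-webgpu.wasm",
      vram_required_MB: 4096,
      low_resource_required: false,
    },
  ],
  // Defaults to the Cache API when unspecified, which the note above calls better tested.
  useIndexedDBCache: false,
};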
/**
 * modelVersion: the prebuilt model libraries that the current npm is compatible with; affects the
 * `model_lib_url`s in `prebuiltAppConfig`.
 *
 * @note The model version does not have to match the npm version, since not every npm update
 * requires an update of the model libraries.
 */
export declare const modelVersion = "v0_2_34";
export declare const modelLibURLPrefix = "https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/web-llm-models/";
/**
 * Default models and model library mapping to be used if unspecified.
 *
 * @note This is the only source of truth for which prebuilt model libraries are compatible with the
 * current WebLLM npm version.
 */
export declare const prebuiltAppConfig: AppConfig;
//# sourceMappingURL=config.d.ts.map
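Because `prebuiltAppConfig` is the source of truth for compatible prebuilt libraries, one way to see which model ids ship with the current npm is simply to read its `model_list`:

import { prebuiltAppConfig } from "./config";

// Lists every prebuilt model id bundled with this npm version.
const availableIds = prebuiltAppConfig.model_list.map((m) => m.model_id);
console.log(availableIds);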
conversation.d.ts
@@ -0,0 +1,48 @@
import { ConvTemplateConfig, Role } from "./config";
/**
 * Helper to keep track of conversation history.
 */
export declare class Conversation {
    messages: Array<[Role, string, string | undefined]>;
    readonly config: ConvTemplateConfig;
    function_string: string;
    use_function_calling: boolean;
    override_system_message?: string;
    constructor(config: ConvTemplateConfig);
    private getPromptArrayInternal;
    /**
     * Get prompt arrays with the first one as system.
     *
     * @returns The prompt array.
     */
    getPromptArray(): Array<string>;
    /**
     * Get the last round of prompts that has not been fed as input.
     *
     * @note This function needs to be used with the assumption that
     * the caller calls appendMessage then appendReplyHeader.
     *
     * @returns The prompt array.
     */
    getPrompArrayLastRound(): string[];
    /**
     * Resets all states for this.conversation.
     */
    reset(): void;
    getStopStr(): string[];
    getStopTokens(): number[];
    appendMessage(role: Role, message: string, role_name?: string): void;
    appendReplyHeader(role: Role): void;
    finishReply(message: string): void;
}
export declare function getConversation(conv_template: string | ConvTemplateConfig, conv_config?: Partial<ConvTemplateConfig>): Conversation;
/**
 * Compare the states of two conversation instances. Equality is defined as: their getPromptArray()
 * should return exactly the same thing, which is determined by the fields messages, function_string,
 * use_function_calling, and override_system_message.
 *
 * @returns True if `convA` equals `convB`.
 * @note We assume convA and convB have the same `this.config`.
 */
export declare function compareConversationObject(convA: Conversation, convB: Conversation): boolean;
//# sourceMappingURL=conversation.d.ts.map
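These are internal helpers, but the declarations above suggest a straightforward flow: build a `Conversation` from a template, append messages, add a reply header, and render the prompt. A hedged sketch; the template name passed to `getConversation()` is a placeholder (real names come from a model's `conv_template`):

import { getConversation, Conversation } from "./conversation";
import { Role } from "./config";

// "my-template" is a placeholder; in practice the template comes from mlc-chat-config.json.
const conv: Conversation = getConversation("my-template");
conv.appendMessage(Role.user, "What is WebGPU?");
conv.appendReplyHeader(Role.assistant);
// Full prompt array, with the system prompt first, ready to be prefilled.
const prompts = conv.getPromptArray();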
engine.d.ts
@@ -0,0 +1,114 @@
import * as API from "./openai_api_protocols/apis";
import { ChatOptions, AppConfig, GenerationConfig, EngineConfig } from "./config";
import { ChatCompletionRequest, ChatCompletion, ChatCompletionChunk, ChatCompletionFinishReason, ChatCompletionRequestNonStreaming, ChatCompletionRequestStreaming, ChatCompletionRequestBase } from "./openai_api_protocols/index";
import { InitProgressCallback, EngineInterface, GenerateProgressCallback, LogitProcessor } from "./types";
/**
 * Creates `Engine`, and loads `modelId` onto WebGPU.
 *
 * Equivalent to `new webllm.Engine().reload(...)`.
 *
 * @param modelId The model to load; needs to be either in `webllm.prebuiltAppConfig` or in
 * `engineConfig.appConfig`.
 * @param engineConfig Optionally configures the engine, see `webllm.EngineConfig`.
 * @returns An initialized `WebLLM.Engine` with `modelId` loaded.
 */
export declare function CreateEngine(modelId: string, engineConfig?: EngineConfig): Promise<Engine>;
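A typical creation call per the signature above. The model id is a placeholder that must match a `ModelRecord` in `prebuiltAppConfig` or in a custom `appConfig`, and `report.text` is an assumed field of the progress report:

import { CreateEngine, Engine } from "./engine";

// Inside an async function or a module with top-level await.
// Placeholder model id; must match a ModelRecord's model_id.
const engine: Engine = await CreateEngine("MyModel-q4f32_1", {
  initProgressCallback: (report) => console.log(report.text),
});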
/**
 * The main interface of Engine, which loads a model and performs tasks.
 *
 * You can initialize one either with `webllm.CreateEngine(modelId)` or with `webllm.Engine().reload(modelId)`.
 */
export declare class Engine implements EngineInterface {
    chat: API.Chat;
    private currentModelId?;
    private logger;
    private logitProcessorRegistry?;
    private logitProcessor?;
    private pipeline?;
    private initProgressCallback?;
    private interruptSignal;
    private deviceLostIsError;
    private config?;
    constructor();
    setInitProgressCallback(initProgressCallback?: InitProgressCallback): void;
    getInitProgressCallback(): InitProgressCallback | undefined;
    setLogitProcessorRegistry(logitProcessorRegistry?: Map<string, LogitProcessor>): void;
    reload(modelId: string, chatOpts?: ChatOptions, appConfig?: AppConfig): Promise<void>;
    generate(input: string | ChatCompletionRequestNonStreaming, progressCallback?: GenerateProgressCallback, streamInterval?: number, genConfig?: GenerationConfig): Promise<string>;
    private _generate;
    /**
     * Similar to `generate()`, but instead of using a callback, we use an async iterable.
     * @param request Request for chat completion.
     * @param genConfig Generation config extracted from `request`.
     */
    chatCompletionAsyncChunkGenerator(request: ChatCompletionRequestStreaming, genConfig: GenerationConfig): AsyncGenerator<ChatCompletionChunk, void, void>;
    /**
     * Completes a single ChatCompletionRequest.
     *
     * @param request An OpenAI-style ChatCompletion request.
     *
     * @note For each choice (i.e. `n`), a request is defined by a single `prefill()` and multiple
     * `decode()` calls. This is important as it determines the behavior of various fields including `seed`.
     */
    chatCompletion(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;
    chatCompletion(request: ChatCompletionRequestStreaming): Promise<AsyncIterable<ChatCompletionChunk>>;
    chatCompletion(request: ChatCompletionRequestBase): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;
    interruptGenerate(): Promise<void>;
    runtimeStatsText(): Promise<string>;
    resetChat(keepStats?: boolean): Promise<void>;
    unload(): Promise<void>;
    getMaxStorageBufferBindingSize(): Promise<number>;
    getGPUVendor(): Promise<string>;
    forwardTokensAndSample(inputIds: Array<number>, isPrefill: boolean): Promise<number>;
    /**
     * @returns Whether the generation stopped.
     */
    stopped(): boolean;
    /**
     * @returns Finish reason; undefined if generation has not started or stopped yet.
     */
    getFinishReason(): ChatCompletionFinishReason | undefined;
    /**
     * Get the currently generated response.
     *
     * @returns The current output message.
     */
    getMessage(): Promise<string>;
    /**
     * Get a new Conversation object based on the chat completion request.
     *
     * @param request The incoming ChatCompletionRequest.
     * @note `request.messages[-1]` is not included, as it would be treated as a normal input to
     * `prefill()`.
     */
    private getConversationFromChatCompletionRequest;
    /**
     * Returns the function string based on request.tools and request.tool_choice, and raises errors
     * if it encounters an invalid request.
     *
     * @param request The ChatCompletionRequest we are about to prefill for.
     * @returns The string used to set Conversation.function_string.
     */
    private getFunctionCallUsage;
    /**
     * Run a prefill step with a given input.
     *
     * If `input` is a ChatCompletionRequest, we treat `input.messages[-1]` as the usual user input.
     * We then convert `input.messages[:-1]` to a `Conversation` object, representing the conversation
     * history.
     *
     * If the new `Conversation` object matches the currently loaded one, it means we are
     * performing multi-round chatting, so we do not reset, hence reusing the KV cache. Otherwise, we
     * reset everything, treating the request as something completely new.
     *
     * @param input The input prompt, or `messages` in OpenAI-like APIs.
     */
    prefill(input: string | ChatCompletionRequest, genConfig?: GenerationConfig): Promise<void>;
    /**
     * Run a decode step to decode the next token.
     */
    decode(genConfig?: GenerationConfig): Promise<void>;
    private getPipeline;
    private asyncLoadTokenizer;
}
//# sourceMappingURL=engine.d.ts.map
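The OpenAI-style `chatCompletion()` overloads above support both non-streaming and streaming use. A hedged streaming sketch; it assumes `engine` was created as in the `CreateEngine` example earlier, and the `choices[0].delta.content` field names follow the OpenAI chat-completion chunk shape, which is an assumption here:

// Streaming sketch; per the overloads, a request with stream: true resolves to an AsyncIterable.
const chunks = await engine.chatCompletion({
  stream: true,
  messages: [{ role: "user", content: "Tell me a short joke." }],
  temperature: 0.7,
});
let reply = "";
for await (const chunk of chunks) {
  // Accumulate the incremental deltas into the full reply.
  reply += chunk.choices[0]?.delta?.content ?? "";
}
console.log(reply);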