Skip to content

Commit

Permalink
Merge pull request #336 from huy-trn/good_for_rag
Browse files Browse the repository at this point in the history
Code compression
  • Loading branch information
yamadashy authored Feb 16, 2025
2 parents acc4121 + 169ea22 commit 1f610a5
Show file tree
Hide file tree
Showing 33 changed files with 1,076 additions and 6 deletions.
49 changes: 49 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,15 @@ repomix --remote https://github.com/yamadashy/repomix/commit/836abcd7335137228ad

```

To compress the output:

```bash
repomix --compress

# You can also use it with remote repositories:
repomix --remote yamadashy/repomix --compress
```

To initialize a new configuration file (`repomix.config.json`):

```bash
Expand Down Expand Up @@ -414,6 +423,7 @@ This format provides a clean, readable structure that is both human-friendly and
- `-o, --output <file>`: Specify the output file name
- `--style <style>`: Specify the output style (`plain`, `xml`, `markdown`)
- `--parsable-style`: Enable parsable output based on the chosen style schema. Note that this can increase token count.
- `--compress`: Perform intelligent code extraction, focusing on essential function and class signatures to reduce token count
- `--output-show-line-numbers`: Show line numbers in the output
- `--copy`: Additionally copy generated output to system clipboard
- `--no-file-summary`: Disable file summary section output
Expand Down Expand Up @@ -511,6 +521,43 @@ repomix --remote https://github.com/yamadashy/repomix --remote-branch 935b695
repomix --remote https://github.com/yamadashy/repomix/commit/836abcd7335137228ad77feb28655d85712680f1
```

### Code Compression

The `--compress` option utilizes tree-sitter to perform intelligent code extraction, focusing on essential function and class signatures while removing implementation details. This can help reduce token count while retaining important structural information.

```bash
repomix --compress
```

For example, this code:

```typescript
const calculateTotal = (items: Item[]) => {
  let total = 0;
  for (const item of items) {
    total += item.price * item.quantity;
  }
  return total;
}
interface Item {
  name: string;
  price: number;
  quantity: number;
}
```

Will be compressed to:

```typescript
const calculateTotal = (items: Item[]) => {
interface Item {
```
> **Note**
> Currently, compression is supported for these languages: TypeScript/JavaScript, Python, Ruby, Java, Go, C#, C/C++, PHP, and Rust.
## ⚙️ Configuration
Create a `repomix.config.json` file in your project root for custom configurations.
Expand All @@ -526,6 +573,7 @@ Here's an explanation of the configuration options:
| `output.filePath` | The name of the output file | `"repomix-output.txt"` |
| `output.style` | The style of the output (`plain`, `xml`, `markdown`) | `"plain"` |
| `output.parsableStyle` | Whether to escape the output based on the chosen style schema. Note that this can increase token count. | `false` |
| `output.compress` | Whether to perform intelligent code extraction to reduce token count | `false` |
| `output.headerText` | Custom text to include in the file header | `null` |
| `output.instructionFilePath` | Path to a file containing detailed custom instructions | `null` |
| `output.fileSummary` | Whether to include a summary section at the beginning of the output | `true` |
Expand All @@ -551,6 +599,7 @@ Example configuration:
"filePath": "repomix-output.xml",
"style": "xml",
"parsableStyle": true,
"compress": false,
"headerText": "Custom header information for the packed file.",
"fileSummary": true,
"directoryStructure": true,
Expand Down
17 changes: 17 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@
"strip-comments": "^2.0.1",
"strip-json-comments": "^5.0.1",
"tiktoken": "^1.0.19",
"tree-sitter-wasms": "^0.1.12",
"web-tree-sitter": "^0.24.7",
"zod": "^3.24.1"
},
"devDependencies": {
Expand Down
1 change: 1 addition & 0 deletions repomix.config.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"output": {
"filePath": "repomix-output.xml",
"style": "xml",
"compress": false,
"headerText": "This repository contains the source code for the Repomix tool.\nRepomix is designed to pack repository contents into a single file,\nmaking it easier for AI systems to analyze and process the codebase.\n\nKey Features:\n- Configurable ignore patterns\n- Custom header text support\n- Efficient file processing and packing\n\nPlease refer to the README.md file for more detailed information on usage and configuration.\n",
"instructionFilePath": "repomix-instruction.md",
"fileSummary": true,
Expand Down
7 changes: 6 additions & 1 deletion src/cli/actions/defaultAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import { type PackResult, pack } from '../../core/packager.js';
import { rethrowValidationErrorIfZodError } from '../../shared/errorHandle.js';
import { logger } from '../../shared/logger.js';
import { printCompletion, printSecurityCheck, printSummary, printTopFiles } from '../cliPrint.js';
import Spinner from '../cliSpinner.js';
import { Spinner } from '../cliSpinner.js';
import type { CliOptions } from '../types.js';
import { runMigrationAction } from './migrationAction.js';

Expand Down Expand Up @@ -166,6 +166,11 @@ const buildCliConfig = (options: CliOptions): RepomixConfigCli => {
if (options.headerText !== undefined) {
cliConfig.output = { ...cliConfig.output, headerText: options.headerText };
}

if (options.compress !== undefined) {
cliConfig.output = { ...cliConfig.output, compress: options.compress };
}

if (options.tokenCountEncoding) {
cliConfig.tokenCount = { encoding: options.tokenCountEncoding };
}
Expand Down
2 changes: 1 addition & 1 deletion src/cli/actions/remoteAction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import pc from 'picocolors';
import { execGitShallowClone, isGitInstalled } from '../../core/file/gitCommand.js';
import { RepomixError } from '../../shared/errorHandle.js';
import { logger } from '../../shared/logger.js';
import Spinner from '../cliSpinner.js';
import { Spinner } from '../cliSpinner.js';
import type { CliOptions } from '../types.js';
import { type DefaultActionRunnerResult, runDefaultAction } from './defaultAction.js';
interface IGitUrl extends GitUrl {
Expand Down
1 change: 1 addition & 0 deletions src/cli/cliRun.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ export const run = async () => {
.option('--no-directory-structure', 'disable directory structure section output')
.option('--remove-comments', 'remove comments')
.option('--remove-empty-lines', 'remove empty lines')
.option('--compress', 'perform code compression to reduce token count')
.addOption(new Option('--verbose', 'enable verbose logging for detailed output').conflicts('quiet'))
.addOption(new Option('--quiet', 'disable all output to stdout').conflicts('verbose'))
.option('--init', 'initialize a new repomix.config.json file')
Expand Down
4 changes: 1 addition & 3 deletions src/cli/cliSpinner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import logUpdate from 'log-update';
import pc from 'picocolors';
import type { CliOptions } from './types.js';

class Spinner {
export class Spinner {
private spinner = cliSpinners.dots;
private message: string;
private currentFrame = 0;
Expand Down Expand Up @@ -67,5 +67,3 @@ class Spinner {
this.stop(`${pc.red('✖')} ${message}`);
}
}

export default Spinner;
2 changes: 2 additions & 0 deletions src/config/configSchema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ export const repomixConfigBaseSchema = z.object({
directoryStructure: z.boolean().optional(),
removeComments: z.boolean().optional(),
removeEmptyLines: z.boolean().optional(),
compress: z.boolean().optional(),
topFilesLength: z.number().optional(),
showLineNumbers: z.boolean().optional(),
copyToClipboard: z.boolean().optional(),
Expand Down Expand Up @@ -64,6 +65,7 @@ export const repomixConfigDefaultSchema = z.object({
directoryStructure: z.boolean().default(true),
removeComments: z.boolean().default(false),
removeEmptyLines: z.boolean().default(false),
compress: z.boolean().default(false),
topFilesLength: z.number().int().min(0).default(5),
showLineNumbers: z.boolean().default(false),
copyToClipboard: z.boolean().default(false),
Expand Down
16 changes: 15 additions & 1 deletion src/core/file/workers/fileProcessWorker.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import type { RepomixConfigMerged } from '../../../config/configSchema.js';
import { logger } from '../../../shared/logger.js';
import { parseFile } from '../../tree-sitter/parseFile.js';
import { getFileManipulator } from '../fileManipulate.js';
import type { ProcessedFile, RawFile } from '../fileTypes.js';

Expand Down Expand Up @@ -33,7 +34,20 @@ export const processContent = async (rawFile: RawFile, config: RepomixConfigMerg

processedContent = processedContent.trim();

if (config.output.showLineNumbers) {
if (config.output.compress) {
try {
const parsedContent = await parseFile(processedContent, rawFile.path, config);
if (parsedContent === undefined) {
logger.trace(`Failed to parse ${rawFile.path} in compressed mode. Using original content.`);
}
processedContent = parsedContent ?? processedContent;
} catch (error: unknown) {
const message = error instanceof Error ? error.message : String(error);
logger.error(`Error parsing ${rawFile.path} in compressed mode: ${message}`);
//re-throw error
throw error;
}
} else if (config.output.showLineNumbers) {
const lines = processedContent.split('\n');
const padding = lines.length.toString().length;
const numberedLines = lines.map((line, i) => `${(i + 1).toString().padStart(padding)}: ${line}`);
Expand Down
23 changes: 23 additions & 0 deletions src/core/tree-sitter/ext2Lang.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
 * Maps a lower-cased file extension (without the leading dot) to the name of
 * the tree-sitter grammar used to parse it.
 *
 * The values double as the `<lang>` part of the grammar file name
 * `tree-sitter-<lang>.wasm`, so they must match the files shipped by the
 * `tree-sitter-wasms` package.
 *
 * @see https://unpkg.com/browse/tree-sitter-wasms@latest/out/
 */
export const ext2Lang = {
  // JavaScript family (ES module and CommonJS variants included)
  vue: 'javascript',
  cjs: 'javascript',
  mjs: 'javascript',
  js: 'javascript',
  jsx: 'javascript',
  // TypeScript family (ES module and CommonJS variants included)
  ts: 'typescript',
  tsx: 'typescript',
  mts: 'typescript',
  cts: 'typescript',
  // C / C++
  h: 'c',
  c: 'c',
  hpp: 'cpp',
  cpp: 'cpp',
  // Other languages
  py: 'python',
  rs: 'rust',
  java: 'java',
  go: 'go',
  cs: 'c_sharp',
  rb: 'ruby',
  php: 'php',
  swift: 'swift',
};
29 changes: 29 additions & 0 deletions src/core/tree-sitter/lang2Query.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import { queryC } from './queries/c.js';
import { queryCSharp } from './queries/cSharp.js';
import { queryCpp } from './queries/cpp.js';
import { queryGo } from './queries/go.js';
import { queryJava } from './queries/java.js';
import { queryJavascript } from './queries/javascript.js';
import { queryPhp } from './queries/php.js';
import { queryPython } from './queries/python.js';
import { queryRuby } from './queries/ruby.js';
import { queryRust } from './queries/rust.js';
import { querySwift } from './queries/swift.js';
import { queryTypescript } from './queries/typescript.js';

/**
 * Tree-sitter query definition for every language that can be compressed,
 * keyed by grammar name.
 *
 * The key set defines `SupportedLang`; entries are listed alphabetically.
 */
export const lang2Query = {
  c: queryC,
  c_sharp: queryCSharp,
  cpp: queryCpp,
  go: queryGo,
  java: queryJava,
  javascript: queryJavascript,
  php: queryPhp,
  python: queryPython,
  ruby: queryRuby,
  rust: queryRust,
  swift: querySwift,
  typescript: queryTypescript,
};

export type SupportedLang = keyof typeof lang2Query;
67 changes: 67 additions & 0 deletions src/core/tree-sitter/languageParser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import * as path from 'node:path';
import Parser from 'web-tree-sitter';

import { RepomixError } from '../../shared/errorHandle.js';
import { ext2Lang } from './ext2Lang.js';
import { type SupportedLang, lang2Query } from './lang2Query.js';
import { loadLanguage } from './loadLanguage.js';

/**
 * Lazily creates and caches one tree-sitter parser and one compiled query per
 * supported language.
 *
 * Call `init()` once before requesting any parser or query.
 */
export class LanguageParser {
  private loadedParsers: {
    [key: string]: Parser;
  } = {};

  private loadedQueries: {
    [key: string]: Parser.Query;
  } = {};

  // Preparations that are currently running, keyed by language name. Sharing
  // the promise keeps concurrent requests for the same language from loading
  // the grammar and compiling the query more than once.
  private pendingLoads: {
    [key: string]: Promise<void>;
  } = {};

  /** Returns the lower-cased extension of `filePath` without the leading dot. */
  private getFileExtension(filePath: string) {
    return path.extname(filePath).toLowerCase().slice(1);
  }

  /**
   * Loads the grammar for `name`, then builds its parser and query.
   *
   * Both caches are written only after every step succeeded, so a failure
   * never leaves a parser cached without its matching query.
   *
   * @throws RepomixError when the grammar cannot be loaded or the query does not compile.
   */
  private async prepareLang(name: SupportedLang) {
    try {
      const lang = await loadLanguage(name);
      const parser = new Parser();
      parser.setLanguage(lang);
      const query = lang.query(lang2Query[name]);
      this.loadedParsers[name] = parser;
      this.loadedQueries[name] = query;
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new RepomixError(`Failed to prepare language ${name}: ${message}`);
    }
  }

  /**
   * Starts preparing `name`, or joins the preparation that is already running.
   * The pending entry is dropped once it settles so a failed load can be retried.
   */
  private ensureLangPrepared(name: SupportedLang): Promise<void> {
    let pending = this.pendingLoads[name];
    if (!pending) {
      pending = this.prepareLang(name).finally(() => {
        delete this.pendingLoads[name];
      });
      this.pendingLoads[name] = pending;
    }
    return pending;
  }

  /**
   * Returns the cached parser for the language `name`, preparing it on first use.
   *
   * @param name name of the language
   */
  public async getParserForLang(name: SupportedLang) {
    if (!this.loadedParsers[name]) {
      await this.ensureLangPrepared(name);
    }
    return this.loadedParsers[name];
  }

  /**
   * Returns the cached query for the language `name`, preparing it on first use.
   *
   * @param name name of the language
   */
  public async getQueryForLang(name: SupportedLang) {
    if (!this.loadedQueries[name]) {
      await this.ensureLangPrepared(name);
    }
    return this.loadedQueries[name];
  }

  /**
   * Picks the language for `filePath` from its extension.
   *
   * @returns the language name, or `undefined` when the extension is not listed in `ext2Lang`.
   */
  public guessTheLang(filePath: string): SupportedLang | undefined {
    const ext = this.getFileExtension(filePath);
    // Own-property check: only extensions actually listed in the table match.
    if (!Object.prototype.hasOwnProperty.call(ext2Lang, ext)) {
      return undefined;
    }
    const lang = ext2Lang[ext as keyof typeof ext2Lang] as SupportedLang;
    return lang;
  }

  /**
   * Initializes the tree-sitter runtime via `Parser.init()`.
   *
   * @throws Error when initialization fails.
   */
  public async init() {
    try {
      await Parser.init();
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to initialize parser: ${message}`);
    }
  }
}
30 changes: 30 additions & 0 deletions src/core/tree-sitter/loadLanguage.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import fs from 'node:fs/promises';
import { createRequire } from 'node:module';
import path from 'node:path';
import Parser from 'web-tree-sitter';

const require = createRequire(import.meta.url);

/**
 * Loads the tree-sitter grammar for `langName` from its WASM file.
 *
 * @param langName grammar name used to locate `tree-sitter-<langName>.wasm`
 * @returns the loaded language
 * @throws Error when `langName` is empty, or when the WASM file cannot be
 *   located or loaded (the underlying message is appended).
 */
export async function loadLanguage(langName: string): Promise<Parser.Language> {
  if (!langName) {
    throw new Error('Invalid language name');
  }

  let language: Parser.Language;
  try {
    const resolvedWasm = await getWasmPath(langName);
    language = await Parser.Language.load(resolvedWasm);
  } catch (cause: unknown) {
    const detail = cause instanceof Error ? cause.message : String(cause);
    throw new Error(`Failed to load language ${langName}: ${detail}`);
  }
  return language;
}

/**
 * Resolves the absolute path of the WASM grammar for `langName` inside the
 * `tree-sitter-wasms` package and confirms the file is accessible.
 *
 * Resolution happens inside the `try` so that a grammar the package does not
 * ship is reported with the same descriptive error as an inaccessible file,
 * instead of a raw module-resolution failure.
 *
 * @param langName grammar name, e.g. `typescript` or `c_sharp`
 * @returns the resolved path of the WASM file
 * @throws Error when the file cannot be resolved or accessed
 */
async function getWasmPath(langName: string): Promise<string> {
  const specifier = `tree-sitter-wasms/out/tree-sitter-${langName}.wasm`;
  let wasmPath: string | undefined;
  try {
    wasmPath = require.resolve(specifier);
    await fs.access(wasmPath);
    return wasmPath;
  } catch {
    // Report the resolved path when there is one, otherwise the specifier that failed to resolve.
    throw new Error(`WASM file not found for language ${langName}: ${wasmPath ?? specifier}`);
  }
}
Loading

0 comments on commit 1f610a5

Please sign in to comment.