Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support ZIP-files without info.csv and make language detection more lax #1174

Merged
merged 5 commits into from
Jul 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cli/src/cli/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,10 @@ export async function run(locations: string[], options: RunOptions): Promise<voi
});
const report = await dolos.analyzePaths(locations);

if (report.warnings.length > 0) {
report.warnings.forEach(warn => warning(warn));
}

const view = closestMatch(options.outputFormat, {
"terminal": () => new TerminalView(report, options),
"console": () => new TerminalView(report, options),
Expand Down
85 changes: 63 additions & 22 deletions lib/src/dolos.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@ import { Result } from "./lib/util/result";
import { csvParse, DSVRowString } from "d3-dsv";
import * as path from "path";
import { Tokenizer } from "./lib/tokenizer/tokenizer";
import { default as fsWithCallbacks, constants } from "fs";
import { constants, default as fsWithCallbacks } from "fs";
import { spawnSync as spawn } from "child_process";
import { tmpdir } from "os";
import {
Language,
LanguagePicker
} from "./lib/util/language";
import { Language, LanguagePicker } from "./lib/util/language";

const fs = fsWithCallbacks.promises;


Expand All @@ -30,6 +28,25 @@ export class Dolos {
this.options = new Options(customOptions);
}

/**
 * Collect every regular file below `dirPath`, descending into
 * subdirectories breadth-first.
 *
 * Entries that are neither a directory nor a regular file are skipped.
 *
 * @param dirPath root directory to scan
 * @returns the combined result of `File.fromPath` for every file found
 */
private async fromDirectory(dirPath: string): Promise<Result<File[]>> {
  const pending = [dirPath];
  const files = [];

  // `pending` grows while it is being iterated: the array iterator also
  // visits the directories that are appended during the walk.
  for (const current of pending) {
    const entries = await fs.readdir(current, { withFileTypes: true });
    for (const entry of entries) {
      const entryPath = path.join(current, entry.name);
      if (entry.isDirectory()) {
        pending.push(entryPath);
      } else if (entry.isFile()) {
        files.push(File.fromPath(entryPath));
      }
    }
  }

  return await Result.all(files);
}

private async fromZIP(zipPath: string): Promise<Result<File[]>> {
const tmpDir = await fs.mkdtemp(path.join(tmpdir(), "dolos-unzip-"));
Expand All @@ -41,12 +58,11 @@ export class Dolos {
throw new Error(`Unzipping failed with exit status ${ status }, stderr: \n${stderr}`);
}
const infoPath = path.join(tmpDir, "info.csv");
try {
await fs.access(infoPath, constants.R_OK);
} catch {
throw new Error("Zip does not contain a required 'info.csv' file");
if (await fs.access(infoPath, constants.R_OK).then(() => true).catch(() => false)) {
return await this.fromCSV(infoPath);
} else {
return await this.fromDirectory(tmpDir);
}
return await this.fromCSV(infoPath);
} finally {
await fs.rm(tmpDir, { recursive: true });
}
Expand All @@ -66,7 +82,7 @@ export class Dolos {
nameNL: row.name_nl as string,
exerciseID: row.exercise_id as string,
createdAt: new Date(row.created_at as string),
labels: row.labels as string
labels: row.label as string || row.labels as string
}))
.map((row: ExtraInfo) => File.fromPath(path.join(dirname, row.filename), row));
return await Result.all(files);
Expand Down Expand Up @@ -106,14 +122,7 @@ export class Dolos {
nameCandidate?: string
): Promise<Report> {

if (files.length < 2) {
throw new Error("You need to supply at least two files");
} else if (files.length == 2 && this.options.maxFingerprintPercentage !== null) {
throw new Error("You have given a maximum hash percentage but your are " +
"comparing two files. Each matching hash will thus " +
"be present in 100% of the files. This option does only" +
"make sense when comparing more than two files.");
} else if (this.index == null) {
if (this.index == null) {
if (this.options.language) {
this.language = this.languagePicker.findLanguage(this.options.language);
} else {
Expand All @@ -123,11 +132,43 @@ export class Dolos {
this.tokenizer = this.language.createTokenizer();
this.index = new Index(this.tokenizer, this.options);
}

const warnings = [];
let filteredFiles;
if (this.languageDetected) {
for (const file of files) {
this.language?.checkLanguage(file);
// Keep only the files whose extension matches the detected language and
// report how many files were dropped because of that.
filteredFiles = files.filter(file => this.language?.extensionMatches(file.path));
const diff = files.length - filteredFiles.length;
if (diff > 0) {
  // Every fragment except the last ends with a space so that the
  // concatenated sentences do not run into each other.
  warnings.push(
    `The language of the files was detected as ${this.language?.name} ` +
    `but ${diff} files were ignored because they did not have a matching extension. ` +
    "You can override this behavior by setting the language explicitly."
  );
}
} else {
filteredFiles = files;
}

// Validate the number of files that will actually be compared: files that
// were dropped by the language filter do not take part in the analysis, so
// counting the unfiltered input could let a single remaining file through.
if (filteredFiles.length < 2) {
  throw new Error("You need to supply at least two files");
} else if (filteredFiles.length == 2 && this.options.maxFingerprintPercentage !== null) {
  throw new Error("You have given a maximum hash percentage but you are " +
    "comparing two files. Each matching hash will thus " +
    "be present in 100% of the files. This option does only " +
    "make sense when comparing more than two files.");
}
return this.index.compareFiles(files, nameCandidate);

// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
const tokenizedFiles = filteredFiles.map(f => this.tokenizer!.tokenizeFile(f));
const fingerprints = await this.index.createMatches(tokenizedFiles);

return new Report(
this.options,
this.language,
tokenizedFiles,
fingerprints,
nameCandidate,
warnings
);
}
}
3 changes: 3 additions & 0 deletions lib/src/lib/analyze/report.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ type Hash = number;
export interface Metadata extends DolosOptions {
languageDetected: boolean;
createdAt: string;
warnings: string[];
}

export class Report {
Expand All @@ -28,6 +29,7 @@ export class Report {
public readonly files: TokenizedFile[],
fingerprints: Map<Hash, SharedFingerprint>,
name?: string,
public readonly warnings: string[] = [],
) {
if (this.options.maxFingerprintCount != null) {
this.kgramMaxFileOccurrences = this.options.maxFingerprintCount;
Expand Down Expand Up @@ -87,6 +89,7 @@ export class Report {
createdAt: this.createdAt,
language: this.language?.name ?? null,
languageDetected: this.options.language == undefined,
warnings: this.warnings,
};
}
}
28 changes: 18 additions & 10 deletions lib/src/lib/util/language.ts
Original file line number Diff line number Diff line change
Expand Up @@ -132,23 +132,31 @@ export class LanguagePicker {
}

/**
 * Find the language to use for tokenization based on the most common
 * extension of the files. If that extension does not match any known
 * language, then a LanguageError is thrown.
 *
 * When several extensions are equally common, the one that reaches the
 * highest count first (in the order of `files`) wins.
 *
 * @param files the files to tokenize
 * @returns the language registered for the most common extension
 * @throws LanguageError if no language could be determined; this includes
 * an empty list of files
 */
public detectLanguage(files: File[]): Language {
  const counts = new Map<string, number>();
  let maxCount = 0;
  let language: Language | undefined = undefined;
  for (const file of files) {
    const count = (counts.get(file.extension) ?? 0) + 1;
    counts.set(file.extension, count);
    // Strictly greater: on a tie the extension that got there first is kept.
    if (count > maxCount) {
      maxCount = count;
      language = this.byExtension.get(file.extension);
    }
  }

  if (language == undefined) {
    throw new LanguageError(
      "Could not detect language based on extension."
    );
  }

  return language;
}

Expand Down
33 changes: 32 additions & 1 deletion lib/src/test/dolos.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ test("should read CSV-files", async t => {
t.true(pairs[0].similarity > 0.75);
});

test("should read ZIP-files", async t => {
test("should read ZIP-files with info.csv", async t => {
const dolos = new Dolos();

const report = await dolos.analyzePaths(["../samples/javascript/simple-dataset.zip"]);
Expand All @@ -249,6 +249,21 @@ test("should read ZIP-files", async t => {
t.true(pairs[0].similarity > 0.75);
});

// ZIP archives that ship no info.csv must still be readable.
test("should read ZIP-files without info.csv", async t => {
  const archive = "../samples/javascript/simple-dataset-no-csv.zip";
  const report = await new Dolos().analyzePaths([archive]);

  t.is(4, report.files.length);
  t.is(report.name, "simple-dataset-no-csv");
  t.is(report.metadata().reportName, "simple-dataset-no-csv");
  t.is(report.metadata().warnings.length, 1);

  const allPairs = report.allPairs();
  t.is(6, allPairs.length);
  t.true(allPairs[0].similarity > 0.75);
});

test("empty files should match 0%", async t => {
const dolos = new Dolos();
const report = await dolos.analyze([new File("file1.js", ""), new File("file2.js", "")]);
Expand All @@ -257,3 +272,19 @@ test("empty files should match 0%", async t => {
t.is(0, pairs[0].overlap);
t.is(0, pairs[0].longest);
});

// Two JavaScript files and one Java file: the expectations below are one
// warning, two files left in the report and a single pair.
test("should generate warning when not all files match detected language", async t => {
  const mixedPaths = [
    "../samples/javascript/sample.js",
    "../samples/javascript/copied_function.js",
    "../samples/java/Caesar.java"
  ];
  const report = await new Dolos().analyzePaths(mixedPaths);

  const { warnings } = report.metadata();
  t.is(warnings.length, 1);
  t.is(2, report.files.length);

  t.is(1, report.allPairs().length);
});
6 changes: 4 additions & 2 deletions lib/src/test/tokenizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ test("language picker should throw an error for unknown extension", t => {
t.throws(() => new LanguagePicker().detectLanguage([new File("unknown.extension", "")]));
});

test("language picker should throw an error for different languages", t => {
t.throws(() => new LanguagePicker().detectLanguage([new File("file.py", ""), new File("file.js", "")]));
// Two Python files against one JavaScript file: Python is expected to win.
test("language picker should detect most common language", t => {
  const mixedFiles = [
    new File("file.py", ""),
    new File("otherfile.py", ""),
    new File("file.js", ""),
  ];
  const picker = new LanguagePicker();
  const { name } = picker.detectLanguage(mixedFiles);
  t.deepEqual(name, "python");
});
Binary file added samples/javascript/simple-dataset-no-csv.zip
Binary file not shown.