Skip to content

Commit

Permalink
[Improvement] Quickly delete local or HDFS data at the shuffleId level.
Browse files Browse the repository at this point in the history
  • Loading branch information
yl09099 committed Aug 25, 2024
1 parent 57f0f8b commit 8a37d06
Show file tree
Hide file tree
Showing 10 changed files with 316 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ public void registerShuffle(
taskInfo.refreshLatestStageAttemptNumber(shuffleId, stageAttemptNumber);
try {
long start = System.currentTimeMillis();
shuffleServer.getShuffleTaskManager().removeShuffleDataSync(appId, shuffleId);
shuffleServer.getShuffleTaskManager().quickRemoveShuffleDataSync(appId, shuffleId);
LOG.info(
"Deleted the previous stage attempt data due to stage recomputing for app: {}, "
+ "shuffleId: {}. It costs {} ms",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -779,6 +779,16 @@ public boolean isAppExpired(String appId) {
* @param shuffleIds
*/
public void removeResourcesByShuffleIds(String appId, List<Integer> shuffleIds) {
// Backward-compatible overload: defaults to the regular (non-quick) delete mode.
removeResourcesByShuffleIds(appId, shuffleIds, false);
}

/**
* Clear up the partial resources of shuffleIds of App.
*
* @param appId
* @param shuffleIds
*/
public void removeResourcesByShuffleIds(String appId, List<Integer> shuffleIds, boolean isQuick) {
Lock writeLock = getAppWriteLock(appId);
writeLock.lock();
try {
Expand Down Expand Up @@ -811,7 +821,7 @@ public void removeResourcesByShuffleIds(String appId, List<Integer> shuffleIds)
withTimeoutExecution(
() -> {
storageManager.removeResources(
new ShufflePurgeEvent(appId, getUserByAppId(appId), shuffleIds));
new ShufflePurgeEvent(appId, getUserByAppId(appId), shuffleIds), isQuick);
return null;
},
storageRemoveOperationTimeoutSec,
Expand Down Expand Up @@ -998,6 +1008,16 @@ public void removeShuffleDataSync(String appId, int shuffleId) {
removeResourcesByShuffleIds(appId, Arrays.asList(shuffleId));
}

/**
* Delete all data under the shuffleId using the synchronous quick delete mode.
*
* <p>"Quick" mode asks the storage managers to rename the shuffle paths synchronously and
* perform the physical removal asynchronously on a background thread (see the quick-delete
* branch in the storage managers' {@code removeResources}).
*
* @param appId application whose shuffle data should be removed
* @param shuffleId shuffle whose data should be removed
*/
public void quickRemoveShuffleDataSync(String appId, int shuffleId) {
removeResourcesByShuffleIds(appId, Arrays.asList(shuffleId), true);
}

/**
* Returns the shuffle data distribution type recorded for the given application.
*
* <p>NOTE(review): this dereferences {@code shuffleTaskInfos.get(appId)} without a null
* check — it will NPE for an unknown or expired appId. Confirm callers only invoke this
* for registered apps.
*/
public ShuffleDataDistributionType getDataDistributionType(String appId) {
return shuffleTaskInfos.get(appId).getDataDistType();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.stream.Collectors;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import com.google.common.collect.Queues;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
Expand All @@ -48,6 +50,7 @@
import org.apache.uniffle.storage.common.HadoopStorage;
import org.apache.uniffle.storage.common.Storage;
import org.apache.uniffle.storage.factory.ShuffleHandlerFactory;
import org.apache.uniffle.storage.handler.AsynchronousDeleteEvent;
import org.apache.uniffle.storage.handler.api.ShuffleDeleteHandler;
import org.apache.uniffle.storage.request.CreateShuffleDeleteHandlerRequest;
import org.apache.uniffle.storage.util.ShuffleStorageUtils;
Expand All @@ -64,12 +67,48 @@ public class HadoopStorageManager extends SingleStorageManager {
private Map<String, HadoopStorage> appIdToStorages = JavaUtils.newConcurrentMap();
private Map<String, HadoopStorage> pathToStorages = JavaUtils.newConcurrentMap();
private final boolean isStorageAuditLogEnabled;
private final BlockingQueue<AsynchronousDeleteEvent> quickNeedDeletePaths =
Queues.newLinkedBlockingQueue();
private Thread needDeletePathThread;

HadoopStorageManager(ShuffleServerConf conf) {
  super(conf);
  hadoopConf = conf.getHadoopConf();
  shuffleServerId = conf.getString(ShuffleServerConf.SHUFFLE_SERVER_ID, "shuffleServerId");
  isStorageAuditLogEnabled = conf.getBoolean(ShuffleServerConf.SERVER_STORAGE_AUDIT_LOG_ENABLED);
  // Background task: drain quickNeedDeletePaths and physically delete the renamed
  // (tmp-suffixed) HDFS paths queued by the quick-delete branch of removeResources().
  Runnable clearNeedDeletePath =
      () -> {
        while (!Thread.currentThread().isInterrupted()) {
          AsynchronousDeleteEvent asynchronousDeleteEvent = null;
          try {
            asynchronousDeleteEvent = quickNeedDeletePaths.take();
            ShuffleDeleteHandler deleteHandler =
                ShuffleHandlerFactory.getInstance()
                    .createShuffleDeleteHandler(
                        new CreateShuffleDeleteHandlerRequest(
                            StorageType.HDFS.name(),
                            asynchronousDeleteEvent.getConf(),
                            shuffleServerId));
            deleteHandler.delete(
                asynchronousDeleteEvent.getNeedDeleteRenamePaths(),
                asynchronousDeleteEvent.getAppId(),
                asynchronousDeleteEvent.getUser());
          } catch (InterruptedException ie) {
            // Restore the interrupt flag and stop draining: the previous catch-all
            // swallowed interrupts and spun forever on an interrupted take().
            Thread.currentThread().interrupt();
            break;
          } catch (Exception e) {
            if (asynchronousDeleteEvent != null) {
              LOG.error(
                  "Delete Path {} failed.",
                  asynchronousDeleteEvent.getNeedDeleteRenamePaths(),
                  e);
            } else {
              LOG.error("Failed to delete a directory in clearNeedDeleteHadoopPathThread", e);
            }
          }
        }
      };
  needDeletePathThread = new Thread(clearNeedDeletePath);
  needDeletePathThread.setName("clearNeedDeleteHadoopPathThread");
  needDeletePathThread.setDaemon(true);
  // BUG FIX: the consumer thread was created but never started in this constructor (and no
  // start() call is visible elsewhere in the change), so queued quick-delete events would
  // accumulate unboundedly and the renamed paths would never be removed.
  needDeletePathThread.start();
}

@Override
Expand Down Expand Up @@ -98,6 +137,11 @@ public Storage selectStorage(ShuffleDataReadEvent event) {

@Override
public void removeResources(PurgeEvent event) {
// Default entry point: regular (blocking) delete. Quick mode instead renames paths
// synchronously and defers the physical HDFS deletion to a background thread.
removeResources(event, false);
}

@Override
public void removeResources(PurgeEvent event, boolean isQuick) {
String appId = event.getAppId();
HadoopStorage storage = getStorageByAppId(appId);
if (storage != null) {
Expand Down Expand Up @@ -148,7 +192,19 @@ public void removeResources(PurgeEvent event) {
storage.getStoragePath()));
}
}
deleteHandler.delete(deletePaths.toArray(new String[0]), appId, event.getUser());
if (isQuick) {
AsynchronousDeleteEvent asynchronousDeleteEvent =
new AsynchronousDeleteEvent(
appId, event.getUser(), storage.getConf(), event.getShuffleIds(), deletePaths);
deleteHandler.quickDelete(asynchronousDeleteEvent);
boolean isSucess = quickNeedDeletePaths.offer(asynchronousDeleteEvent);
if (!isSucess) {
LOG.warn(
"Remove the case where the clearNeedDeleteHadoopPathThread queue is full and cannot accept elements.");
}
} else {
deleteHandler.delete(deletePaths.toArray(new String[0]), appId, event.getUser());
}
removeAppStorageInfo(event);
} else {
LOG.warn("Storage gotten is null when removing resources for event: {}", event);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,9 +157,13 @@ public Map<String, StorageInfo> getStorageInfo() {
}

public void removeResources(PurgeEvent event) {
// Backward-compatible overload: defaults to the regular (non-quick) delete mode.
removeResources(event, false);
}

/**
* Purges the event's resources from both storage tiers (warm first, then cold).
*
* @param event purge event describing the app/shuffle scope to remove
* @param isQuick when true, delegate storage managers rename the paths synchronously and
*     delete them asynchronously instead of deleting in place
*/
public void removeResources(PurgeEvent event, boolean isQuick) {
LOG.info("Start to remove resource of {}", event);
warmStorageManager.removeResources(event, isQuick);
coldStorageManager.removeResources(event, isQuick);
}

public StorageManager getColdStorageManager() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import java.util.Optional;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
Expand All @@ -40,6 +41,7 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Queues;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
Expand Down Expand Up @@ -68,6 +70,7 @@
import org.apache.uniffle.storage.common.Storage;
import org.apache.uniffle.storage.common.StorageMediaProvider;
import org.apache.uniffle.storage.factory.ShuffleHandlerFactory;
import org.apache.uniffle.storage.handler.AsynchronousDeleteEvent;
import org.apache.uniffle.storage.handler.api.ShuffleDeleteHandler;
import org.apache.uniffle.storage.request.CreateShuffleDeleteHandlerRequest;
import org.apache.uniffle.storage.util.ShuffleStorageUtils;
Expand All @@ -90,6 +93,9 @@ public class LocalStorageManager extends SingleStorageManager {
private final List<StorageMediaProvider> typeProviders = Lists.newArrayList();

private final boolean isStorageAuditLogEnabled;
private final BlockingQueue<AsynchronousDeleteEvent> quickNeedDeletePaths =
Queues.newLinkedBlockingQueue();
private Thread needDeletePathThread;

@VisibleForTesting
LocalStorageManager(ShuffleServerConf conf) {
Expand Down Expand Up @@ -175,6 +181,37 @@ public class LocalStorageManager extends SingleStorageManager {
localStorages.stream().map(LocalStorage::getBasePath).collect(Collectors.toList())));
this.checker = new LocalStorageChecker(conf, localStorages);
isStorageAuditLogEnabled = conf.getBoolean(ShuffleServerConf.SERVER_STORAGE_AUDIT_LOG_ENABLED);

Runnable clearNeedDeletePath =
() -> {
while (true) {
AsynchronousDeleteEvent asynchronousDeleteEvent = null;
ShuffleDeleteHandler deleteHandler =
ShuffleHandlerFactory.getInstance()
.createShuffleDeleteHandler(
new CreateShuffleDeleteHandlerRequest(
StorageType.LOCALFILE.name(), new Configuration()));
try {
asynchronousDeleteEvent = quickNeedDeletePaths.take();
deleteHandler.delete(
asynchronousDeleteEvent.getNeedDeleteRenamePaths(),
asynchronousDeleteEvent.getAppId(),
asynchronousDeleteEvent.getUser());
} catch (Exception e) {
if (asynchronousDeleteEvent != null) {
LOG.error(
"Delete Path {} failed.",
asynchronousDeleteEvent.getNeedDeleteRenamePaths(),
e);
} else {
LOG.error("Failed to delete a directory in clearNeedDeleteHadoopPathThread", e);
}
}
}
};
needDeletePathThread = new Thread(clearNeedDeletePath);
needDeletePathThread.setName("clearNeedDeleteLocalPathThread");
needDeletePathThread.setDaemon(true);
}

private StorageMedia getStorageTypeForBasePath(String basePath) {
Expand Down Expand Up @@ -266,6 +303,11 @@ public Checker getStorageChecker() {

@Override
public void removeResources(PurgeEvent event) {
// Default entry point: regular (blocking) delete of local shuffle paths; quick mode
// renames synchronously and defers the physical deletion to a background thread.
removeResources(event, false);
}

@Override
public void removeResources(PurgeEvent event, boolean isQuick) {
String appId = event.getAppId();
String user = event.getUser();
List<Integer> shuffleSet =
Expand Down Expand Up @@ -327,8 +369,19 @@ public void removeResources(PurgeEvent event) {
}
})
.collect(Collectors.toList());

deleteHandler.delete(deletePaths.toArray(new String[deletePaths.size()]), appId, user);
if (isQuick) {
AsynchronousDeleteEvent asynchronousDeleteEvent =
new AsynchronousDeleteEvent(
appId, event.getUser(), null, event.getShuffleIds(), deletePaths);
deleteHandler.quickDelete(asynchronousDeleteEvent);
boolean isSucess = quickNeedDeletePaths.offer(asynchronousDeleteEvent);
if (!isSucess) {
LOG.warn(
"Remove the case where the clearNeedDeleteHadoopPathThread queue is full and cannot accept elements.");
}
} else {
deleteHandler.delete(deletePaths.toArray(new String[deletePaths.size()]), appId, user);
}
removeAppStorageInfo(event);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ public interface StorageManager {

void removeResources(PurgeEvent event);

void removeResources(PurgeEvent event, boolean isQuick);

void start();

void stop();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.uniffle.storage.handler;

import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;

/**
 * Event describing shuffle paths scheduled for asynchronous ("quick") deletion: each path to
 * delete is paired with a rename target carrying a temporary suffix; the rename happens
 * synchronously and a background thread later deletes the renamed paths.
 *
 * <p>Instances are immutable after construction (all fields are final).
 */
public class AsynchronousDeleteEvent {
  /** Suffix appended to a path to form its rename target before background deletion. */
  private static final String TEMPORARY_SUFFIX = "_tmp";

  private final String appId;
  private final String user;
  private final List<Integer> shuffleIds;
  // Hadoop configuration used to recreate the delete handler; null for local storage.
  private final Configuration conf;
  /** Records the mapping between the path to be deleted and the path to be renamed. */
  private final Map<String, String> needDeletePathAndRenamePath;

  public AsynchronousDeleteEvent(
      String appId,
      String user,
      Configuration conf,
      List<Integer> shuffleIds,
      List<String> needDeletePath) {
    this.appId = appId;
    this.user = user;
    this.shuffleIds = shuffleIds;
    this.conf = conf;
    // Plain concatenation replaces the previous varargs StringUtils.join(s, suffix), which
    // concatenated with no separator — identical output for non-null paths, clearer intent.
    this.needDeletePathAndRenamePath =
        needDeletePath.stream()
            .collect(Collectors.toMap(Function.identity(), s -> s + TEMPORARY_SUFFIX));
  }

  public String getAppId() {
    return appId;
  }

  public String getUser() {
    return user;
  }

  public List<Integer> getShuffleIds() {
    return shuffleIds;
  }

  public Configuration getConf() {
    return conf;
  }

  public Map<String, String> getNeedDeletePathAndRenamePath() {
    return needDeletePathAndRenamePath;
  }

  /** Returns only the rename targets, i.e. the paths the background thread must delete. */
  public String[] getNeedDeleteRenamePaths() {
    return needDeletePathAndRenamePath.values().toArray(new String[0]);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

package org.apache.uniffle.storage.handler.api;

import org.apache.uniffle.storage.handler.AsynchronousDeleteEvent;

public interface ShuffleDeleteHandler {

/**
Expand All @@ -25,4 +27,7 @@ public interface ShuffleDeleteHandler {
* @param appId ApplicationId for delete
*/
void delete(String[] storageBasePaths, String appId, String user);

/** Rename the file and then delete it asynchronously. */
void quickDelete(AsynchronousDeleteEvent shuffleQuickPurgeEvent);
}
Loading

0 comments on commit 8a37d06

Please sign in to comment.